diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index ca52c82e5bb5..f7b69a0e71e1 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -1188,7 +1188,7 @@ defined:
 	return -ECHILD and it will be called again in ref-walk mode.
 
-``_weak_revalidate``
+``d_weak_revalidate``
 	called when the VFS needs to revalidate a "jumped" dentry.  This
 	is called when a path-walk ends at dentry that was not acquired
 	by doing a lookup in the parent directory.  This includes "/",
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 87cf5c010d5d..ed2e45f9b762 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -2923,7 +2923,7 @@ Produces::
   bash-1994  [000] ....  4342.324898: ima_get_action <-process_measurement
   bash-1994  [000] ....  4342.324898: ima_match_policy <-ima_get_action
   bash-1994  [000] ....  4342.324899: do_truncate <-do_last
-  bash-1994  [000] ....  4342.324899: should_remove_suid <-do_truncate
+  bash-1994  [000] ....  4342.324899: setattr_should_drop_suidgid <-do_truncate
   bash-1994  [000] ....  4342.324899: notify_change <-do_truncate
   bash-1994  [000] ....  4342.324900: current_fs_time <-notify_change
   bash-1994  [000] ....  4342.324900: current_kernel_time <-current_fs_time
diff --git a/Makefile b/Makefile
index e6b09052f222..71caf5938361 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 5
 PATCHLEVEL = 10
-SUBLEVEL = 175
+SUBLEVEL = 176
 EXTRAVERSION =
 NAME = Dare mighty things
diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c
index 0b4965573656..88bacf4999c4 100644
--- a/arch/s390/boot/ipl_report.c
+++ b/arch/s390/boot/ipl_report.c
@@ -57,11 +57,19 @@ static unsigned long find_bootdata_space(struct ipl_rb_components *comps,
 	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE &&
 	    intersects(INITRD_START, INITRD_SIZE, safe_addr, size))
 		safe_addr = INITRD_START + INITRD_SIZE;
+	if (intersects(safe_addr, size, (unsigned long)comps, comps->len)) {
+		safe_addr = (unsigned long)comps + comps->len;
+		goto repeat;
+	}
 	for_each_rb_entry(comp, comps)
 		if (intersects(safe_addr, size, comp->addr, comp->len)) {
 			safe_addr = comp->addr + comp->len;
 			goto repeat;
 		}
+	if (intersects(safe_addr, size, (unsigned long)certs, certs->len)) {
+		safe_addr = (unsigned long)certs + certs->len;
+		goto repeat;
+	}
 	for_each_rb_entry(cert, certs)
 		if (intersects(safe_addr, size, cert->addr, cert->len)) {
 			safe_addr = cert->addr + cert->len;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 1906387a0faf..0b7c81389c50 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2309,6 +2309,7 @@ static void mce_restart(void)
 {
 	mce_timer_delete_all();
 	on_each_cpu(mce_cpu_restart, NULL, 1);
+	mce_schedule_work();
 }
 
 /* Toggle features for corrected errors */
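The next hunk (arch/x86/kvm/vmx/nested.c) derives ia32e from the VM-entry controls once, up front, and adds two consistency checks on the guest state: CR0.PG set without CR0.PE is architecturally invalid, and IA-32e mode requires both CR4.PAE and CR0.PG. A minimal standalone sketch of the same predicates; the masks and helper below are illustrative re-statements, not the kernel's CC() tracing machinery:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_CR0_PE  (1u << 0)
#define X86_CR0_PG  (1u << 31)
#define X86_CR4_PAE (1u << 5)

/* Illustrative restatement of the architectural rules the hunk enforces. */
static bool guest_state_valid(uint32_t cr0, uint32_t cr4, bool ia32e)
{
	/* Paging requires protected mode: PG=1 with PE=0 is invalid. */
	if ((cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)
		return false;
	/* IA-32e mode requires both PAE and paging. */
	if (ia32e && (!(cr4 & X86_CR4_PAE) || !(cr0 & X86_CR0_PG)))
		return false;
	return true;
}

int main(void)
{
	/* PG without PE: rejected, matching the new CC() check. */
	printf("%d\n", guest_state_valid(X86_CR0_PG, 0, false));	/* 0 */
	/* Long mode with PAE and paging: accepted. */
	printf("%d\n", guest_state_valid(X86_CR0_PG | X86_CR0_PE,
					 X86_CR4_PAE, true));		/* 1 */
	return 0;
}

Hoisting the ia32e computation also lets the existing IA32_EFER checks reuse it, which is why the assignment inside the EFER block is deleted.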
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 91371b01eae0..c165ddbb672f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2998,7 +2998,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 					struct vmcs12 *vmcs12,
 					enum vm_entry_failure_code *entry_failure_code)
 {
-	bool ia32e;
+	bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
 
 	*entry_failure_code = ENTRY_FAIL_DEFAULT;
 
@@ -3024,6 +3024,13 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 					   vmcs12->guest_ia32_perf_global_ctrl)))
 		return -EINVAL;
 
+	if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
+		return -EINVAL;
+
+	if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
+	    CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
+		return -EINVAL;
+
 	/*
 	 * If the load IA32_EFER VM-entry control is 1, the following checks
 	 * are performed on the field for the IA32_EFER MSR:
@@ -3035,7 +3042,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 	 */
 	if (to_vmx(vcpu)->nested.nested_run_pending &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
-		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
 		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
 		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
 		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 011e042b47ba..5ec47af786dd 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -586,7 +586,8 @@ void __init sme_enable(struct boot_params *bp)
 	cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
 				     ((u64)bp->ext_cmd_line_ptr << 32));
 
-	cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+	if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
+		return;
 
 	if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
 		sme_me_mask = me_mask;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 40c53632512b..9617688b58b3 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -16,13 +16,7 @@ menuconfig BLK_DEV
 
 if BLK_DEV
 
-config BLK_DEV_NULL_BLK
-	tristate "Null test block driver"
-	select CONFIGFS_FS
-
-config BLK_DEV_NULL_BLK_FAULT_INJECTION
-	bool "Support fault injection for Null test block driver"
-	depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
+source "drivers/block/null_blk/Kconfig"
 
 config BLK_DEV_FD
 	tristate "Normal floppy disk support"
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index e1f63117ee94..a3170859e01d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,12 +41,7 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_ZRAM) += zram/
 obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
 
-obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
-null_blk-objs	:= null_blk_main.o
-ifeq ($(CONFIG_BLK_DEV_ZONED), y)
-null_blk-$(CONFIG_TRACING) += null_blk_trace.o
-endif
-null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o
+obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
 
 skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
deleted file mode 100644
index 7de703f28617..000000000000
--- a/drivers/block/null_blk.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __BLK_NULL_BLK_H
-#define __BLK_NULL_BLK_H
-
-#undef pr_fmt
-#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
-
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/blk-mq.h>
-#include <linux/hrtimer.h>
-#include <linux/configfs.h>
-#include <linux/badblocks.h>
-#include <linux/fault-inject.h>
-
-struct nullb_cmd {
-	struct request *rq;
-	struct bio *bio;
-	unsigned int tag;
-	blk_status_t error;
-	struct nullb_queue *nq;
-	struct hrtimer timer;
-	bool fake_timeout;
-};
-
-struct nullb_queue {
-	unsigned long *tag_map;
-	wait_queue_head_t wait;
-	unsigned int queue_depth;
-	struct nullb_device *dev;
-	unsigned int requeue_selection;
-
-	struct nullb_cmd *cmds;
-};
-
-struct nullb_device {
-	struct nullb *nullb;
-	struct config_item item;
-	struct radix_tree_root data; /* data stored in the disk */
-	struct radix_tree_root cache; /* disk cache data */
-	unsigned long flags; /* device flags */
-	unsigned int curr_cache;
-	struct badblocks badblocks;
-
-	unsigned int nr_zones;
-	unsigned int nr_zones_imp_open;
-	unsigned int nr_zones_exp_open;
-	unsigned int nr_zones_closed;
-	struct blk_zone *zones;
-	sector_t zone_size_sects;
-	spinlock_t zone_lock;
-	unsigned long *zone_locks;
-
-	unsigned long size; /* device size in MB */
-	unsigned long completion_nsec; /* time in ns to complete a request */
-	unsigned long cache_size; /* disk cache size in MB */
-	unsigned long zone_size; /* zone size in MB if device is zoned */
-	unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
-	unsigned int zone_nr_conv; /* number of conventional zones */
-	unsigned int zone_max_open; /* max number of open zones */
-	unsigned int zone_max_active; /* max number of active zones */
-	unsigned int submit_queues; /* number of submission queues */
-	unsigned int home_node; /* home node for the device */
-	unsigned int queue_mode; /* block interface */
-	unsigned int blocksize; /* block size */
-	unsigned int irqmode; /* IRQ completion handler */
-	unsigned int hw_queue_depth; /* queue depth */
-	unsigned int index; /* index of the disk, only valid with a disk */
-	unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
-	bool blocking; /* blocking blk-mq device */
-	bool use_per_node_hctx; /* use per-node allocation for hardware context */
-	bool power; /* power on/off the device */
-	bool memory_backed; /* if data is stored in memory */
-	bool discard; /* if support discard */
-	bool zoned; /* if device is zoned */
-};
-
-struct nullb {
-	struct nullb_device *dev;
-	struct list_head list;
-	unsigned int index;
-	struct request_queue *q;
-	struct gendisk *disk;
-	struct blk_mq_tag_set *tag_set;
-	struct blk_mq_tag_set __tag_set;
-	unsigned int queue_depth;
-	atomic_long_t cur_bytes;
-	struct hrtimer bw_timer;
-	unsigned long cache_flush_pos;
-	spinlock_t lock;
-
-	struct nullb_queue *queues;
-	unsigned int nr_queues;
-	char disk_name[DISK_NAME_LEN];
-};
-
-blk_status_t null_process_cmd(struct nullb_cmd *cmd,
-			      enum req_opf op, sector_t sector,
-			      unsigned int nr_sectors);
-
-#ifdef CONFIG_BLK_DEV_ZONED
-int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q);
-int null_register_zoned_dev(struct nullb *nullb);
-void null_free_zoned_dev(struct nullb_device *dev);
-int null_report_zones(struct gendisk *disk, sector_t sector,
-		      unsigned int nr_zones, report_zones_cb cb, void *data);
-blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
-				    enum req_opf op, sector_t sector,
-				    sector_t nr_sectors);
-size_t null_zone_valid_read_len(struct nullb *nullb,
-				sector_t sector, unsigned int len);
-#else
-static inline int null_init_zoned_dev(struct nullb_device *dev,
-				      struct request_queue *q)
-{
-	pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
-	return -EINVAL;
-}
-static inline int null_register_zoned_dev(struct nullb *nullb)
-{
-	return -ENODEV;
-}
-static inline void null_free_zoned_dev(struct nullb_device *dev) {}
-static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
-			enum req_opf op, sector_t sector, sector_t nr_sectors)
-{
-	return BLK_STS_NOTSUPP;
-}
-static inline size_t null_zone_valid_read_len(struct nullb *nullb,
-					      sector_t sector,
-					      unsigned int len)
-{
-	return len;
-}
-#define null_report_zones	NULL
-#endif /* CONFIG_BLK_DEV_ZONED */
-#endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk/Kconfig b/drivers/block/null_blk/Kconfig
new file mode 100644
index 000000000000..6bf1f8ca20a2
--- /dev/null
+++ b/drivers/block/null_blk/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Null block device driver configuration
+#
+
+config BLK_DEV_NULL_BLK
+	tristate "Null test block driver"
+	select CONFIGFS_FS
+
+config BLK_DEV_NULL_BLK_FAULT_INJECTION
+	bool "Support fault injection for Null test block driver"
+	depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
diff --git a/drivers/block/null_blk/Makefile b/drivers/block/null_blk/Makefile
new file mode 100644
index 000000000000..84c36e512ab8
--- /dev/null
+++ b/drivers/block/null_blk/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# needed for trace events
+ccflags-y += -I$(src)
+
+obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
+null_blk-objs	:= main.o
+ifeq ($(CONFIG_BLK_DEV_ZONED), y)
+null_blk-$(CONFIG_TRACING) += trace.o
+endif
+null_blk-$(CONFIG_BLK_DEV_ZONED) += zoned.o
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
new file mode 100644
index 000000000000..25db095e943b
--- /dev/null
+++ b/drivers/block/null_blk/main.c
@@ -0,0 +1,2036 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Add configfs and memory store: Kyungchan Koh <kkc6196@xxxxxx> and
+ * Shaohua Li <shli@xxxxxx>
+ */
+#include <linux/module.h>
+
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include "null_blk.h"
+
+#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK		(PAGE_SECTORS - 1)
+
+#define FREE_BATCH		16
+
+#define TICKS_PER_SEC		50ULL
+#define TIMER_INTERVAL		(NSEC_PER_SEC / TICKS_PER_SEC)
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static DECLARE_FAULT_ATTR(null_timeout_attr);
+static DECLARE_FAULT_ATTR(null_requeue_attr);
+static DECLARE_FAULT_ATTR(null_init_hctx_attr);
+#endif
+
+static inline u64 mb_per_tick(int mbps)
+{
+	return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
+}
+
+/*
+ * Status flags for nullb_device.
+ *
+ * CONFIGURED:	Device has been configured and turned on. Cannot reconfigure.
+ * UP:		Device is currently on and visible in userspace.
+ * THROTTLED:	Device is being throttled.
+ * CACHE:	Device is using a write-back cache.
+ */
+enum nullb_device_flags {
+	NULLB_DEV_FL_CONFIGURED	= 0,
+	NULLB_DEV_FL_UP		= 1,
+	NULLB_DEV_FL_THROTTLED	= 2,
+	NULLB_DEV_FL_CACHE	= 3,
+};
+
+#define MAP_SZ		((PAGE_SIZE >> SECTOR_SHIFT) + 2)
+/*
+ * nullb_page is a page in memory for nullb devices.
+ *
+ * @page:	The page holding the data.
+ * @bitmap:	The bitmap represents which sector in the page has data.
+ *		Each bit represents one block size. For example, sector 8
+ *		will use the 7th bit
+ * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
+ * page is being flushed to storage. FREE means the cache page is freed and
+ * should be skipped from flushing to storage. Please see
+ * null_make_cache_space
+ */
+struct nullb_page {
+	struct page *page;
+	DECLARE_BITMAP(bitmap, MAP_SZ);
+};
+#define NULLB_PAGE_LOCK (MAP_SZ - 1)
+#define NULLB_PAGE_FREE (MAP_SZ - 2)
+
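The nullb_page layout above maps each backing page to a small bitmap: with the default 512-byte block size, one bit per sector, plus the two reserved top bits (NULLB_PAGE_LOCK, NULLB_PAGE_FREE) for flush bookkeeping. A small userspace sketch of the index arithmetic, assuming 4 KiB pages and 512-byte sectors; the macros are restated locally for illustration:

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT		9
#define PAGE_SHIFT		12	/* assume 4 KiB pages */
#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
#define SECTOR_MASK		(PAGE_SECTORS - 1)

int main(void)
{
	uint64_t sector = 21;	/* arbitrary example sector */

	/* Which nullb_page (radix tree index) and which bit within it. */
	printf("page index = %llu, bit = %llu\n",
	       (unsigned long long)(sector >> PAGE_SECTORS_SHIFT),
	       (unsigned long long)(sector & SECTOR_MASK));
	/* -> page index 2, bit 5: sector 21 lives in the third page. */
	return 0;
}

The same shift and mask pair shows up throughout the driver below (null_free_sector, null_insert_page, copy_to_nullb), always splitting a sector number into a radix-tree index and a bit offset.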
+static LIST_HEAD(nullb_list);
+static struct mutex lock;
+static int null_major;
+static DEFINE_IDA(nullb_indexes);
+static struct blk_mq_tag_set tag_set;
+
+enum {
+	NULL_IRQ_NONE		= 0,
+	NULL_IRQ_SOFTIRQ	= 1,
+	NULL_IRQ_TIMER		= 2,
+};
+
+enum {
+	NULL_Q_BIO		= 0,
+	NULL_Q_RQ		= 1,
+	NULL_Q_MQ		= 2,
+};
+
+static int g_no_sched;
+module_param_named(no_sched, g_no_sched, int, 0444);
+MODULE_PARM_DESC(no_sched, "No io scheduler");
+
+static int g_submit_queues = 1;
+module_param_named(submit_queues, g_submit_queues, int, 0444);
+MODULE_PARM_DESC(submit_queues, "Number of submission queues");
+
+static int g_home_node = NUMA_NO_NODE;
+module_param_named(home_node, g_home_node, int, 0444);
+MODULE_PARM_DESC(home_node, "Home node for the device");
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+/*
+ * For more details about fault injection, please refer to
+ * Documentation/fault-injection/fault-injection.rst.
+ */
+static char g_timeout_str[80];
+module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
+MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>");
+
+static char g_requeue_str[80];
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
+MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>");
+
+static char g_init_hctx_str[80];
+module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
+MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
+#endif
+
+static int g_queue_mode = NULL_Q_MQ;
+
+static int null_param_store_val(const char *str, int *val, int min, int max)
+{
+	int ret, new_val;
+
+	ret = kstrtoint(str, 10, &new_val);
+	if (ret)
+		return -EINVAL;
+
+	if (new_val < min || new_val > max)
+		return -EINVAL;
+
+	*val = new_val;
+	return 0;
+}
+
+static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
+{
+	return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
+}
+
+static const struct kernel_param_ops null_queue_mode_param_ops = {
+	.set	= null_set_queue_mode,
+	.get	= param_get_int,
+};
+
+device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
+MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
+
+static int g_gb = 250;
+module_param_named(gb, g_gb, int, 0444);
+MODULE_PARM_DESC(gb, "Size in GB");
+
+static int g_bs = 512;
+module_param_named(bs, g_bs, int, 0444);
+MODULE_PARM_DESC(bs, "Block size (in bytes)");
+
+static unsigned int nr_devices = 1;
+module_param(nr_devices, uint, 0444);
+MODULE_PARM_DESC(nr_devices, "Number of devices to register");
+
+static bool g_blocking;
+module_param_named(blocking, g_blocking, bool, 0444);
+MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
+
+static bool shared_tags;
+module_param(shared_tags, bool, 0444);
+MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
+
+static bool g_shared_tag_bitmap;
+module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
+MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");
+
+static int g_irqmode = NULL_IRQ_SOFTIRQ;
+
+static int null_set_irqmode(const char *str, const struct kernel_param *kp)
+{
+	return
null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, + NULL_IRQ_TIMER); +} + +static const struct kernel_param_ops null_irqmode_param_ops = { + .set = null_set_irqmode, + .get = param_get_int, +}; + +device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); +MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); + +static unsigned long g_completion_nsec = 10000; +module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); +MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); + +static int g_hw_queue_depth = 64; +module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); + +static bool g_use_per_node_hctx; +module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); +MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); + +static bool g_zoned; +module_param_named(zoned, g_zoned, bool, S_IRUGO); +MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); + +static unsigned long g_zone_size = 256; +module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); + +static unsigned long g_zone_capacity; +module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); +MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); + +static unsigned int g_zone_nr_conv; +module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); +MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); + +static unsigned int g_zone_max_open; +module_param_named(zone_max_open, g_zone_max_open, uint, 0444); +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); + +static unsigned int g_zone_max_active; +module_param_named(zone_max_active, g_zone_max_active, uint, 0444); +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); + +static struct nullb_device *null_alloc_dev(void); +static void null_free_dev(struct nullb_device *dev); +static void null_del_dev(struct nullb *nullb); +static int null_add_dev(struct nullb_device *dev); +static void null_free_device_storage(struct nullb_device *dev, bool is_cache); + +static inline struct nullb_device *to_nullb_device(struct config_item *item) +{ + return item ? 
container_of(item, struct nullb_device, item) : NULL; +} + +static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%lu\n", val); +} + +static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static ssize_t nullb_device_uint_attr_store(unsigned int *val, + const char *page, size_t count) +{ + unsigned int tmp; + int result; + + result = kstrtouint(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_ulong_attr_store(unsigned long *val, + const char *page, size_t count) +{ + int result; + unsigned long tmp; + + result = kstrtoul(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, + size_t count) +{ + bool tmp; + int result; + + result = kstrtobool(page, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +/* The following macro should only be used with TYPE = {uint, ulong, bool}. */ +#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ +static ssize_t \ +nullb_device_##NAME##_show(struct config_item *item, char *page) \ +{ \ + return nullb_device_##TYPE##_attr_show( \ + to_nullb_device(item)->NAME, page); \ +} \ +static ssize_t \ +nullb_device_##NAME##_store(struct config_item *item, const char *page, \ + size_t count) \ +{ \ + int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ + struct nullb_device *dev = to_nullb_device(item); \ + TYPE new_value = 0; \ + int ret; \ + \ + ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ + if (ret < 0) \ + return ret; \ + if (apply_fn) \ + ret = apply_fn(dev, new_value); \ + else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ + ret = -EBUSY; \ + if (ret < 0) \ + return ret; \ + dev->NAME = new_value; \ + return count; \ +} \ +CONFIGFS_ATTR(nullb_device_, NAME); + +static int nullb_apply_submit_queues(struct nullb_device *dev, + unsigned int submit_queues) +{ + struct nullb *nullb = dev->nullb; + struct blk_mq_tag_set *set; + + if (!nullb) + return 0; + + /* + * Make sure that null_init_hctx() does not access nullb->queues[] past + * the end of that array. + */ + if (submit_queues > nr_cpu_ids) + return -EINVAL; + set = nullb->tag_set; + blk_mq_update_nr_hw_queues(set, submit_queues); + return set->nr_hw_queues == submit_queues ? 
0 : -ENOMEM; +} + +NULLB_DEVICE_ATTR(size, ulong, NULL); +NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); +NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); +NULLB_DEVICE_ATTR(home_node, uint, NULL); +NULLB_DEVICE_ATTR(queue_mode, uint, NULL); +NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(irqmode, uint, NULL); +NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); +NULLB_DEVICE_ATTR(index, uint, NULL); +NULLB_DEVICE_ATTR(blocking, bool, NULL); +NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); +NULLB_DEVICE_ATTR(memory_backed, bool, NULL); +NULLB_DEVICE_ATTR(discard, bool, NULL); +NULLB_DEVICE_ATTR(mbps, uint, NULL); +NULLB_DEVICE_ATTR(cache_size, ulong, NULL); +NULLB_DEVICE_ATTR(zoned, bool, NULL); +NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); +NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); + +static ssize_t nullb_device_power_show(struct config_item *item, char *page) +{ + return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); +} + +static ssize_t nullb_device_power_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *dev = to_nullb_device(item); + bool newp = false; + ssize_t ret; + + ret = nullb_device_bool_attr_store(&newp, page, count); + if (ret < 0) + return ret; + + if (!dev->power && newp) { + if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) + return count; + if (null_add_dev(dev)) { + clear_bit(NULLB_DEV_FL_UP, &dev->flags); + return -ENOMEM; + } + + set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); + dev->power = newp; + } else if (dev->power && !newp) { + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = newp; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } + clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); + } + + return count; +} + +CONFIGFS_ATTR(nullb_device_, power); + +static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) +{ + struct nullb_device *t_dev = to_nullb_device(item); + + return badblocks_show(&t_dev->badblocks, page, 0); +} + +static ssize_t nullb_device_badblocks_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *t_dev = to_nullb_device(item); + char *orig, *buf, *tmp; + u64 start, end; + int ret; + + orig = kstrndup(page, count, GFP_KERNEL); + if (!orig) + return -ENOMEM; + + buf = strstrip(orig); + + ret = -EINVAL; + if (buf[0] != '+' && buf[0] != '-') + goto out; + tmp = strchr(&buf[1], '-'); + if (!tmp) + goto out; + *tmp = '\0'; + ret = kstrtoull(buf + 1, 0, &start); + if (ret) + goto out; + ret = kstrtoull(tmp + 1, 0, &end); + if (ret) + goto out; + ret = -EINVAL; + if (start > end) + goto out; + /* enable badblocks */ + cmpxchg(&t_dev->badblocks.shift, -1, 0); + if (buf[0] == '+') + ret = badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1); + else + ret = badblocks_clear(&t_dev->badblocks, start, + end - start + 1); + if (ret == 0) + ret = count; +out: + kfree(orig); + return ret; +} +CONFIGFS_ATTR(nullb_device_, badblocks); + +static struct configfs_attribute *nullb_device_attrs[] = { + &nullb_device_attr_size, + &nullb_device_attr_completion_nsec, + &nullb_device_attr_submit_queues, + &nullb_device_attr_home_node, + &nullb_device_attr_queue_mode, + &nullb_device_attr_blocksize, + &nullb_device_attr_irqmode, + &nullb_device_attr_hw_queue_depth, + &nullb_device_attr_index, + &nullb_device_attr_blocking, + 
&nullb_device_attr_use_per_node_hctx, + &nullb_device_attr_power, + &nullb_device_attr_memory_backed, + &nullb_device_attr_discard, + &nullb_device_attr_mbps, + &nullb_device_attr_cache_size, + &nullb_device_attr_badblocks, + &nullb_device_attr_zoned, + &nullb_device_attr_zone_size, + &nullb_device_attr_zone_capacity, + &nullb_device_attr_zone_nr_conv, + &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_max_active, + NULL, +}; + +static void nullb_device_release(struct config_item *item) +{ + struct nullb_device *dev = to_nullb_device(item); + + null_free_device_storage(dev, false); + null_free_dev(dev); +} + +static struct configfs_item_operations nullb_device_ops = { + .release = nullb_device_release, +}; + +static const struct config_item_type nullb_device_type = { + .ct_item_ops = &nullb_device_ops, + .ct_attrs = nullb_device_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct +config_item *nullb_group_make_item(struct config_group *group, const char *name) +{ + struct nullb_device *dev; + + dev = null_alloc_dev(); + if (!dev) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&dev->item, name, &nullb_device_type); + + return &dev->item; +} + +static void +nullb_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct nullb_device *dev = to_nullb_device(item); + + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = false; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } + + config_item_put(item); +} + +static ssize_t memb_group_features_show(struct config_item *item, char *page) +{ + return snprintf(page, PAGE_SIZE, + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); +} + +CONFIGFS_ATTR_RO(memb_group_, features); + +static struct configfs_attribute *nullb_group_attrs[] = { + &memb_group_attr_features, + NULL, +}; + +static struct configfs_group_operations nullb_group_ops = { + .make_item = nullb_group_make_item, + .drop_item = nullb_group_drop_item, +}; + +static const struct config_item_type nullb_group_type = { + .ct_group_ops = &nullb_group_ops, + .ct_attrs = nullb_group_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nullb_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "nullb", + .ci_type = &nullb_group_type, + }, + }, +}; + +static inline int null_cache_active(struct nullb *nullb) +{ + return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); +} + +static struct nullb_device *null_alloc_dev(void) +{ + struct nullb_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); + INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); + if (badblocks_init(&dev->badblocks, 0)) { + kfree(dev); + return NULL; + } + + dev->size = g_gb * 1024; + dev->completion_nsec = g_completion_nsec; + dev->submit_queues = g_submit_queues; + dev->home_node = g_home_node; + dev->queue_mode = g_queue_mode; + dev->blocksize = g_bs; + dev->irqmode = g_irqmode; + dev->hw_queue_depth = g_hw_queue_depth; + dev->blocking = g_blocking; + dev->use_per_node_hctx = g_use_per_node_hctx; + dev->zoned = g_zoned; + dev->zone_size = g_zone_size; + dev->zone_capacity = g_zone_capacity; + dev->zone_nr_conv = g_zone_nr_conv; + dev->zone_max_open = g_zone_max_open; + dev->zone_max_active = g_zone_max_active; + return dev; +} + +static void null_free_dev(struct nullb_device *dev) +{ + if (!dev) + return; + + null_free_zoned_dev(dev); + badblocks_exit(&dev->badblocks); + 
kfree(dev); +} + +static void put_tag(struct nullb_queue *nq, unsigned int tag) +{ + clear_bit_unlock(tag, nq->tag_map); + + if (waitqueue_active(&nq->wait)) + wake_up(&nq->wait); +} + +static unsigned int get_tag(struct nullb_queue *nq) +{ + unsigned int tag; + + do { + tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); + if (tag >= nq->queue_depth) + return -1U; + } while (test_and_set_bit_lock(tag, nq->tag_map)); + + return tag; +} + +static void free_cmd(struct nullb_cmd *cmd) +{ + put_tag(cmd->nq, cmd->tag); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); + +static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + unsigned int tag; + + tag = get_tag(nq); + if (tag != -1U) { + cmd = &nq->cmds[tag]; + cmd->tag = tag; + cmd->error = BLK_STS_OK; + cmd->nq = nq; + if (nq->dev->irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } + return cmd; + } + + return NULL; +} + +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +{ + struct nullb_cmd *cmd; + DEFINE_WAIT(wait); + + cmd = __alloc_cmd(nq); + if (cmd || !can_wait) + return cmd; + + do { + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + cmd = __alloc_cmd(nq); + if (cmd) + break; + + io_schedule(); + } while (1); + + finish_wait(&nq->wait, &wait); + return cmd; +} + +static void end_cmd(struct nullb_cmd *cmd) +{ + int queue_mode = cmd->nq->dev->queue_mode; + + switch (queue_mode) { + case NULL_Q_MQ: + blk_mq_end_request(cmd->rq, cmd->error); + return; + case NULL_Q_BIO: + cmd->bio->bi_status = cmd->error; + bio_endio(cmd->bio); + break; + } + + free_cmd(cmd); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) +{ + end_cmd(container_of(timer, struct nullb_cmd, timer)); + + return HRTIMER_NORESTART; +} + +static void null_cmd_end_timer(struct nullb_cmd *cmd) +{ + ktime_t kt = cmd->nq->dev->completion_nsec; + + hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); +} + +static void null_complete_rq(struct request *rq) +{ + end_cmd(blk_mq_rq_to_pdu(rq)); +} + +static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +{ + struct nullb_page *t_page; + + t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); + if (!t_page) + goto out; + + t_page->page = alloc_pages(gfp_flags, 0); + if (!t_page->page) + goto out_freepage; + + memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); + return t_page; +out_freepage: + kfree(t_page); +out: + return NULL; +} + +static void null_free_page(struct nullb_page *t_page) +{ + __set_bit(NULLB_PAGE_FREE, t_page->bitmap); + if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) + return; + __free_page(t_page->page); + kfree(t_page); +} + +static bool null_page_empty(struct nullb_page *page) +{ + int size = MAP_SZ - 2; + + return find_first_bit(page->bitmap, size) == size; +} + +static void null_free_sector(struct nullb *nullb, sector_t sector, + bool is_cache) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page, *ret; + struct radix_tree_root *root; + + root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + t_page = radix_tree_lookup(root, idx); + if (t_page) { + __clear_bit(sector_bit, t_page->bitmap); + + if (null_page_empty(t_page)) { + ret = radix_tree_delete_item(root, idx, t_page); + WARN_ON(ret != t_page); + null_free_page(ret); + if (is_cache) + nullb->dev->curr_cache -= PAGE_SIZE; + } + } +} + +static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, + struct nullb_page *t_page, bool is_cache) +{ + struct radix_tree_root *root; + + root = is_cache ? &nullb->dev->cache : &nullb->dev->data; + + if (radix_tree_insert(root, idx, t_page)) { + null_free_page(t_page); + t_page = radix_tree_lookup(root, idx); + WARN_ON(!t_page || t_page->page->index != idx); + } else if (is_cache) + nullb->dev->curr_cache += PAGE_SIZE; + + return t_page; +} + +static void null_free_device_storage(struct nullb_device *dev, bool is_cache) +{ + unsigned long pos = 0; + int nr_pages; + struct nullb_page *ret, *t_pages[FREE_BATCH]; + struct radix_tree_root *root; + + root = is_cache ? &dev->cache : &dev->data; + + do { + int i; + + nr_pages = radix_tree_gang_lookup(root, + (void **)t_pages, pos, FREE_BATCH); + + for (i = 0; i < nr_pages; i++) { + pos = t_pages[i]->page->index; + ret = radix_tree_delete_item(root, pos, t_pages[i]); + WARN_ON(ret != t_pages[i]); + null_free_page(ret); + } + + pos++; + } while (nr_pages == FREE_BATCH); + + if (is_cache) + dev->curr_cache = 0; +} + +static struct nullb_page *__null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write, bool is_cache) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page; + struct radix_tree_root *root; + + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data;
+	t_page = radix_tree_lookup(root, idx);
+	WARN_ON(t_page && t_page->page->index != idx);
+
+	if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
+		return t_page;
+
+	return NULL;
+}
+
+static struct nullb_page *null_lookup_page(struct nullb *nullb,
+	sector_t sector, bool for_write, bool ignore_cache)
+{
+	struct nullb_page *page = NULL;
+
+	if (!ignore_cache)
+		page = __null_lookup_page(nullb, sector, for_write, true);
+	if (page)
+		return page;
+	return __null_lookup_page(nullb, sector, for_write, false);
+}
+
+static struct nullb_page *null_insert_page(struct nullb *nullb,
+					   sector_t sector, bool ignore_cache)
+	__releases(&nullb->lock)
+	__acquires(&nullb->lock)
+{
+	u64 idx;
+	struct nullb_page *t_page;
+
+	t_page = null_lookup_page(nullb, sector, true, ignore_cache);
+	if (t_page)
+		return t_page;
+
+	spin_unlock_irq(&nullb->lock);
+
+	t_page = null_alloc_page(GFP_NOIO);
+	if (!t_page)
+		goto out_lock;
+
+	if (radix_tree_preload(GFP_NOIO))
+		goto out_freepage;
+
+	spin_lock_irq(&nullb->lock);
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	t_page->page->index = idx;
+	t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
+	radix_tree_preload_end();
+
+	return t_page;
+out_freepage:
+	null_free_page(t_page);
+out_lock:
+	spin_lock_irq(&nullb->lock);
+	return null_lookup_page(nullb, sector, true, ignore_cache);
+}
+
+static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
+{
+	int i;
+	unsigned int offset;
+	u64 idx;
+	struct nullb_page *t_page, *ret;
+	void *dst, *src;
+
+	idx = c_page->page->index;
+
+	t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
+
+	__clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
+	if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
+		null_free_page(c_page);
+		if (t_page && null_page_empty(t_page)) {
+			ret = radix_tree_delete_item(&nullb->dev->data,
+				idx, t_page);
+			null_free_page(t_page);
+		}
+		return 0;
+	}
+
+	if (!t_page)
+		return -ENOMEM;
+
+	src = kmap_atomic(c_page->page);
+	dst = kmap_atomic(t_page->page);
+
+	for (i = 0; i < PAGE_SECTORS;
+			i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
+		if (test_bit(i, c_page->bitmap)) {
+			offset = (i << SECTOR_SHIFT);
+			memcpy(dst + offset, src + offset,
+				nullb->dev->blocksize);
+			__set_bit(i, t_page->bitmap);
+		}
+	}
+
+	kunmap_atomic(dst);
+	kunmap_atomic(src);
+
+	ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
+	null_free_page(ret);
+	nullb->dev->curr_cache -= PAGE_SIZE;
+
+	return 0;
+}
+
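Note how null_flush_cache_page() above clears NULLB_PAGE_LOCK after the copy and then honors a deferred NULLB_PAGE_FREE, while null_free_page() (earlier in this file) only marks FREE and returns when LOCK is still set. A condensed, single-threaded sketch of that deferred-free handshake; plain booleans stand in for the bitmap bits, so this is illustrative only, not the driver's locking:

#include <stdbool.h>
#include <stdio.h>

struct cache_page {
	bool locked;	/* stands in for NULLB_PAGE_LOCK */
	bool freed;	/* stands in for NULLB_PAGE_FREE */
};

/* Freer side: defer the free if a flush is in flight. */
static void page_free(struct cache_page *p)
{
	p->freed = true;
	if (p->locked)
		return;		/* the flusher will finish the free */
	printf("freed immediately\n");
}

/* Flusher side: clear LOCK, then honor a deferred free. */
static void page_flush_done(struct cache_page *p)
{
	p->locked = false;
	if (p->freed)
		printf("freed by flusher after flush\n");
}

int main(void)
{
	struct cache_page p = { .locked = true };

	page_free(&p);		/* deferred: flush in progress */
	page_flush_done(&p);	/* completes the free */
	return 0;
}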
+static int null_make_cache_space(struct nullb *nullb, unsigned long n)
+{
+	int i, err, nr_pages;
+	struct nullb_page *c_pages[FREE_BATCH];
+	unsigned long flushed = 0, one_round;
+
+again:
+	if ((nullb->dev->cache_size * 1024 * 1024) >
+	     nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
+		return 0;
+
+	nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
+			(void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
+	/*
+	 * null_flush_cache_page() could unlock before using the c_pages. To
+	 * avoid a race, we don't allow page free here.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		nullb->cache_flush_pos = c_pages[i]->page->index;
+		/*
+		 * We found the page which is being flushed to disk by other
+		 * threads
+		 */
+		if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
+			c_pages[i] = NULL;
+		else
+			__set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
+	}
+
+	one_round = 0;
+	for (i = 0; i < nr_pages; i++) {
+		if (c_pages[i] == NULL)
+			continue;
+		err = null_flush_cache_page(nullb, c_pages[i]);
+		if (err)
+			return err;
+		one_round++;
+	}
+	flushed += one_round << PAGE_SHIFT;
+
+	if (n > flushed) {
+		if (nr_pages == 0)
+			nullb->cache_flush_pos = 0;
+		if (one_round == 0) {
+			/* give other threads a chance */
+			spin_unlock_irq(&nullb->lock);
+			spin_lock_irq(&nullb->lock);
+		}
+		goto again;
+	}
+	return 0;
+}
+
+static int copy_to_nullb(struct nullb *nullb, struct page *source,
+	unsigned int off, sector_t sector, size_t n, bool is_fua)
+{
+	size_t temp, count = 0;
+	unsigned int offset;
+	struct nullb_page *t_page;
+	void *dst, *src;
+
+	while (count < n) {
+		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+		if (null_cache_active(nullb) && !is_fua)
+			null_make_cache_space(nullb, PAGE_SIZE);
+
+		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+		t_page = null_insert_page(nullb, sector,
+			!null_cache_active(nullb) || is_fua);
+		if (!t_page)
+			return -ENOSPC;
+
+		src = kmap_atomic(source);
+		dst = kmap_atomic(t_page->page);
+		memcpy(dst + offset, src + off + count, temp);
+		kunmap_atomic(dst);
+		kunmap_atomic(src);
+
+		__set_bit(sector & SECTOR_MASK, t_page->bitmap);
+
+		if (is_fua)
+			null_free_sector(nullb, sector, true);
+
+		count += temp;
+		sector += temp >> SECTOR_SHIFT;
+	}
+	return 0;
+}
+
+static int copy_from_nullb(struct nullb *nullb, struct page *dest,
+	unsigned int off, sector_t sector, size_t n)
+{
+	size_t temp, count = 0;
+	unsigned int offset;
+	struct nullb_page *t_page;
+	void *dst, *src;
+
+	while (count < n) {
+		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+		t_page = null_lookup_page(nullb, sector, false,
+			!null_cache_active(nullb));
+
+		dst = kmap_atomic(dest);
+		if (!t_page) {
+			memset(dst + off + count, 0, temp);
+			goto next;
+		}
+		src = kmap_atomic(t_page->page);
+		memcpy(dst + off + count, src + offset, temp);
+		kunmap_atomic(src);
+next:
+		kunmap_atomic(dst);
+
+		count += temp;
+		sector += temp >> SECTOR_SHIFT;
+	}
+	return 0;
+}
+
+static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
+			       unsigned int len, unsigned int off)
+{
+	void *dst;
+
+	dst = kmap_atomic(page);
+	memset(dst + off, 0xFF, len);
+	kunmap_atomic(dst);
+}
+
+static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
+{
+	size_t temp;
+
+	spin_lock_irq(&nullb->lock);
+	while (n > 0) {
+		temp = min_t(size_t, n, nullb->dev->blocksize);
+		null_free_sector(nullb, sector, false);
+		if (null_cache_active(nullb))
+			null_free_sector(nullb, sector, true);
+		sector += temp >> SECTOR_SHIFT;
+		n -= temp;
+	}
+	spin_unlock_irq(&nullb->lock);
+}
+
+static int null_handle_flush(struct nullb *nullb)
+{
+	int err;
+
+	if (!null_cache_active(nullb))
+		return 0;
+
+	spin_lock_irq(&nullb->lock);
+	while (true) {
+		err = null_make_cache_space(nullb,
+			nullb->dev->cache_size * 1024 * 1024);
+		if (err || nullb->dev->curr_cache == 0)
+			break;
+	}
+
+	WARN_ON(!radix_tree_empty(&nullb->dev->cache));
+	spin_unlock_irq(&nullb->lock);
+	return err;
+}
+
+static int null_transfer(struct nullb *nullb, struct page *page,
unsigned int len, unsigned int off, bool is_write, sector_t sector, + bool is_fua) +{ + struct nullb_device *dev = nullb->dev; + unsigned int valid_len = len; + int err = 0; + + if (!is_write) { + if (dev->zoned) + valid_len = null_zone_valid_read_len(nullb, + sector, len); + + if (valid_len) { + err = copy_from_nullb(nullb, page, off, + sector, valid_len); + off += valid_len; + len -= valid_len; + } + + if (len) + nullb_fill_pattern(nullb, page, len, off); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + err = copy_to_nullb(nullb, page, off, sector, len, is_fua); + } + + return err; +} + +static int null_handle_rq(struct nullb_cmd *cmd) +{ + struct request *rq = cmd->rq; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int len; + sector_t sector; + struct req_iterator iter; + struct bio_vec bvec; + + sector = blk_rq_pos(rq); + + if (req_op(rq) == REQ_OP_DISCARD) { + null_handle_discard(nullb, sector, blk_rq_bytes(rq)); + return 0; + } + + spin_lock_irq(&nullb->lock); + rq_for_each_segment(bvec, rq, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(req_op(rq)), sector, + rq->cmd_flags & REQ_FUA); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + + return 0; +} + +static int null_handle_bio(struct nullb_cmd *cmd) +{ + struct bio *bio = cmd->bio; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int len; + sector_t sector; + struct bio_vec bvec; + struct bvec_iter iter; + + sector = bio->bi_iter.bi_sector; + + if (bio_op(bio) == REQ_OP_DISCARD) { + null_handle_discard(nullb, sector, + bio_sectors(bio) << SECTOR_SHIFT); + return 0; + } + + spin_lock_irq(&nullb->lock); + bio_for_each_segment(bvec, bio, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(bio_op(bio)), sector, + bio->bi_opf & REQ_FUA); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + return 0; +} + +static void null_stop_queue(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_stop_hw_queues(q); +} + +static void null_restart_queue_async(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_start_stopped_hw_queues(q, true); +} + +static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts = BLK_STS_OK; + struct request *rq = cmd->rq; + + if (!hrtimer_active(&nullb->bw_timer)) + hrtimer_restart(&nullb->bw_timer); + + if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { + null_stop_queue(nullb); + /* race with timer */ + if (atomic_long_read(&nullb->cur_bytes) > 0) + null_restart_queue_async(nullb); + /* requeue request */ + sts = BLK_STS_DEV_RESOURCE; + } + return sts; +} + +static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, + sector_t sector, + sector_t nr_sectors) +{ + struct badblocks *bb = &cmd->nq->dev->badblocks; + sector_t first_bad; + int bad_sectors; + + if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_IOERR; + + return BLK_STS_OK; +} + +static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, + enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + 
int err; + + if (dev->queue_mode == NULL_Q_BIO) + err = null_handle_bio(cmd); + else + err = null_handle_rq(cmd); + + return errno_to_blk_status(err); +} + +static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct bio *bio; + + if (dev->memory_backed) + return; + + if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { + zero_fill_bio(cmd->bio); + } else if (req_op(cmd->rq) == REQ_OP_READ) { + __rq_for_each_bio(bio, cmd->rq) + zero_fill_bio(bio); + } +} + +static inline void nullb_complete_cmd(struct nullb_cmd *cmd) +{ + /* + * Since root privileges are required to configure the null_blk + * driver, it is fine that this driver does not initialize the + * data buffers of read commands. Zero-initialize these buffers + * anyway if KMSAN is enabled to prevent that KMSAN complains + * about null_blk not initializing read data buffers. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + nullb_zero_read_cmd_buffer(cmd); + + /* Complete IO by inline, softirq or timer */ + switch (cmd->nq->dev->irqmode) { + case NULL_IRQ_SOFTIRQ: + switch (cmd->nq->dev->queue_mode) { + case NULL_Q_MQ: + blk_mq_complete_request(cmd->rq); + break; + case NULL_Q_BIO: + /* + * XXX: no proper submitting cpu information available. + */ + end_cmd(cmd); + break; + } + break; + case NULL_IRQ_NONE: + end_cmd(cmd); + break; + case NULL_IRQ_TIMER: + null_cmd_end_timer(cmd); + break; + } +} + +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors) +{ + struct nullb_device *dev = cmd->nq->dev; + blk_status_t ret; + + if (dev->badblocks.shift != -1) { + ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (ret != BLK_STS_OK) + return ret; + } + + if (dev->memory_backed) + return null_handle_memory_backed(cmd, op); + + return BLK_STS_OK; +} + +static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors, enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts; + + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { + sts = null_handle_throttled(cmd); + if (sts != BLK_STS_OK) + return sts; + } + + if (op == REQ_OP_FLUSH) { + cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + goto out; + } + + if (dev->zoned) + sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); + else + sts = null_process_cmd(cmd, op, sector, nr_sectors); + + /* Do not overwrite errors (e.g. 
timeout errors) */
+	if (cmd->error == BLK_STS_OK)
+		cmd->error = sts;
+
+out:
+	nullb_complete_cmd(cmd);
+	return BLK_STS_OK;
+}
+
+static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
+{
+	struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
+	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+	unsigned int mbps = nullb->dev->mbps;
+
+	if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
+		return HRTIMER_NORESTART;
+
+	atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
+	null_restart_queue_async(nullb);
+
+	hrtimer_forward_now(&nullb->bw_timer, timer_interval);
+
+	return HRTIMER_RESTART;
+}
+
+static void nullb_setup_bwtimer(struct nullb *nullb)
+{
+	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+
+	hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	nullb->bw_timer.function = nullb_bwtimer_fn;
+	atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
+	hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
+}
+
+static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
+{
+	int index = 0;
+
+	if (nullb->nr_queues != 1)
+		index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
+
+	return &nullb->queues[index];
+}
+
+static blk_qc_t null_submit_bio(struct bio *bio)
+{
+	sector_t sector = bio->bi_iter.bi_sector;
+	sector_t nr_sectors = bio_sectors(bio);
+	struct nullb *nullb = bio->bi_disk->private_data;
+	struct nullb_queue *nq = nullb_to_queue(nullb);
+	struct nullb_cmd *cmd;
+
+	cmd = alloc_cmd(nq, 1);
+	cmd->bio = bio;
+
+	null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
+	return BLK_QC_T_NONE;
+}
+
+static bool should_timeout_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_timeout_str[0])
+		return should_fail(&null_timeout_attr, 1);
+#endif
+	return false;
+}
+
+static bool should_requeue_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_requeue_str[0])
+		return should_fail(&null_requeue_attr, 1);
+#endif
+	return false;
+}
+
+static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
+{
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	pr_info("rq %p timed out\n", rq);
+
+	/*
+	 * If the device is marked as blocking (i.e. memory backed or zoned
+	 * device), the submission path may be blocked waiting for resources
+	 * and cause real timeouts. For these real timeouts, the submission
+	 * path will complete the request using blk_mq_complete_request().
+	 * Only fake timeouts need to execute blk_mq_complete_request() here.
+	 */
+	cmd->error = BLK_STS_TIMEOUT;
+	if (cmd->fake_timeout)
+		blk_mq_complete_request(rq);
+	return BLK_EH_DONE;
+}
+
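The bandwidth throttle above is effectively a token bucket: nullb_bwtimer_fn() refills cur_bytes to mb_per_tick(mbps) every TIMER_INTERVAL (1/50 s) and restarts any stopped queues, while submitters (null_handle_throttled(), earlier) subtract request bytes and stop the queue once the budget goes negative. A sketch of the per-tick budget arithmetic, restating mb_per_tick() in userspace; the 100 MB/s cap is an assumed example value:

#include <stdint.h>
#include <stdio.h>

#define TICKS_PER_SEC 50ULL

/* Same arithmetic as the driver's mb_per_tick(): bytes allowed per tick. */
static uint64_t mb_per_tick(int mbps)
{
	return (1 << 20) / TICKS_PER_SEC * (uint64_t)mbps;
}

int main(void)
{
	int64_t budget = mb_per_tick(100);	/* 100 MB/s cap */

	/* A 1 MiB request spends from the current tick's budget. */
	budget -= 1 << 20;
	printf("per tick: %llu bytes, after 1 MiB request: %lld\n",
	       (unsigned long long)mb_per_tick(100), (long long)budget);
	/* A negative budget would stop the queue until the next refill. */
	return 0;
}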
+static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	struct nullb_queue *nq = hctx->driver_data;
+	sector_t nr_sectors = blk_rq_sectors(bd->rq);
+	sector_t sector = blk_rq_pos(bd->rq);
+
+	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+
+	if (nq->dev->irqmode == NULL_IRQ_TIMER) {
+		hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cmd->timer.function = null_cmd_timer_expired;
+	}
+	cmd->rq = bd->rq;
+	cmd->error = BLK_STS_OK;
+	cmd->nq = nq;
+	cmd->fake_timeout = should_timeout_request(bd->rq) ||
+		blk_should_fake_timeout(bd->rq->q);
+
+	blk_mq_start_request(bd->rq);
+
+	if (should_requeue_request(bd->rq)) {
+		/*
+		 * Alternate between hitting the core BUSY path, and the
+		 * driver driven requeue path
+		 */
+		nq->requeue_selection++;
+		if (nq->requeue_selection & 1)
+			return BLK_STS_RESOURCE;
+		else {
+			blk_mq_requeue_request(bd->rq, true);
+			return BLK_STS_OK;
+		}
+	}
+	if (cmd->fake_timeout)
+		return BLK_STS_OK;
+
+	return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
+}
+
+static void cleanup_queue(struct nullb_queue *nq)
+{
+	kfree(nq->tag_map);
+	kfree(nq->cmds);
+}
+
+static void cleanup_queues(struct nullb *nullb)
+{
+	int i;
+
+	for (i = 0; i < nullb->nr_queues; i++)
+		cleanup_queue(&nullb->queues[i]);
+
+	kfree(nullb->queues);
+}
+
+static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+	struct nullb_queue *nq = hctx->driver_data;
+	struct nullb *nullb = nq->dev->nullb;
+
+	nullb->nr_queues--;
+}
+
+static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
+{
+	init_waitqueue_head(&nq->wait);
+	nq->queue_depth = nullb->queue_depth;
+	nq->dev = nullb->dev;
+}
+
+static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
+			  unsigned int hctx_idx)
+{
+	struct nullb *nullb = hctx->queue->queuedata;
+	struct nullb_queue *nq;
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1))
+		return -EFAULT;
+#endif
+
+	nq = &nullb->queues[hctx_idx];
+	hctx->driver_data = nq;
+	null_init_queue(nullb, nq);
+	nullb->nr_queues++;
+
+	return 0;
+}
+
+static const struct blk_mq_ops null_mq_ops = {
+	.queue_rq       = null_queue_rq,
+	.complete	= null_complete_rq,
+	.timeout	= null_timeout_rq,
+	.init_hctx	= null_init_hctx,
+	.exit_hctx	= null_exit_hctx,
+};
+
+static void null_del_dev(struct nullb *nullb)
+{
+	struct nullb_device *dev;
+
+	if (!nullb)
+		return;
+
+	dev = nullb->dev;
+
+	ida_simple_remove(&nullb_indexes, nullb->index);
+
+	list_del_init(&nullb->list);
+
+	del_gendisk(nullb->disk);
+
+	if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
+		hrtimer_cancel(&nullb->bw_timer);
+		atomic_long_set(&nullb->cur_bytes, LONG_MAX);
+		null_restart_queue_async(nullb);
+	}
+
+	blk_cleanup_queue(nullb->q);
+	if (dev->queue_mode == NULL_Q_MQ &&
+	    nullb->tag_set == &nullb->__tag_set)
+		blk_mq_free_tag_set(nullb->tag_set);
+	put_disk(nullb->disk);
+	cleanup_queues(nullb);
+	if (null_cache_active(nullb))
+		null_free_device_storage(nullb->dev, true);
+	kfree(nullb);
+	dev->nullb = NULL;
+}
+
+static void null_config_discard(struct nullb *nullb)
+{
+	if (nullb->dev->discard == false)
+		return;
+
+	if (nullb->dev->zoned) {
+		nullb->dev->discard = false;
+		pr_info("discard option is ignored in zoned mode\n");
+		return;
+	}
+
+
nullb->q->limits.discard_granularity = nullb->dev->blocksize; + nullb->q->limits.discard_alignment = nullb->dev->blocksize; + blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); +} + +static const struct block_device_operations null_bio_ops = { + .owner = THIS_MODULE, + .submit_bio = null_submit_bio, + .report_zones = null_report_zones, +}; + +static const struct block_device_operations null_rq_ops = { + .owner = THIS_MODULE, + .report_zones = null_report_zones, +}; + +static int setup_commands(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + int i, tag_size; + + nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); + if (!nq->cmds) + return -ENOMEM; + + tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; + nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); + if (!nq->tag_map) { + kfree(nq->cmds); + return -ENOMEM; + } + + for (i = 0; i < nq->queue_depth; i++) { + cmd = &nq->cmds[i]; + cmd->tag = -1U; + } + + return 0; +} + +static int setup_queues(struct nullb *nullb) +{ + nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), + GFP_KERNEL); + if (!nullb->queues) + return -ENOMEM; + + nullb->queue_depth = nullb->dev->hw_queue_depth; + + return 0; +} + +static int init_driver_queues(struct nullb *nullb) +{ + struct nullb_queue *nq; + int i, ret = 0; + + for (i = 0; i < nullb->dev->submit_queues; i++) { + nq = &nullb->queues[i]; + + null_init_queue(nullb, nq); + + ret = setup_commands(nq); + if (ret) + return ret; + nullb->nr_queues++; + } + return 0; +} + +static int null_gendisk_register(struct nullb *nullb) +{ + sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; + struct gendisk *disk; + + disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); + if (!disk) + return -ENOMEM; + set_capacity(disk, size); + + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->major = null_major; + disk->first_minor = nullb->index; + if (queue_is_mq(nullb->q)) + disk->fops = &null_rq_ops; + else + disk->fops = &null_bio_ops; + disk->private_data = nullb; + disk->queue = nullb->q; + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + if (nullb->dev->zoned) { + int ret = null_register_zoned_dev(nullb); + + if (ret) + return ret; + } + + add_disk(disk); + return 0; +} + +static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) +{ + set->ops = &null_mq_ops; + set->nr_hw_queues = nullb ? nullb->dev->submit_queues : + g_submit_queues; + set->queue_depth = nullb ? nullb->dev->hw_queue_depth : + g_hw_queue_depth; + set->numa_node = nullb ? 
nullb->dev->home_node : g_home_node; + set->cmd_size = sizeof(struct nullb_cmd); + set->flags = BLK_MQ_F_SHOULD_MERGE; + if (g_no_sched) + set->flags |= BLK_MQ_F_NO_SCHED; + if (g_shared_tag_bitmap) + set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; + set->driver_data = NULL; + + if ((nullb && nullb->dev->blocking) || g_blocking) + set->flags |= BLK_MQ_F_BLOCKING; + + return blk_mq_alloc_tag_set(set); +} + +static int null_validate_conf(struct nullb_device *dev) +{ + dev->blocksize = round_down(dev->blocksize, 512); + dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); + + if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { + if (dev->submit_queues != nr_online_nodes) + dev->submit_queues = nr_online_nodes; + } else if (dev->submit_queues > nr_cpu_ids) + dev->submit_queues = nr_cpu_ids; + else if (dev->submit_queues == 0) + dev->submit_queues = 1; + + dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); + dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); + + /* Do memory allocation, so set blocking */ + if (dev->memory_backed) + dev->blocking = true; + else /* cache is meaningless */ + dev->cache_size = 0; + dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, + dev->cache_size); + dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); + /* can not stop a queue */ + if (dev->queue_mode == NULL_Q_BIO) + dev->mbps = 0; + + if (dev->zoned && + (!dev->zone_size || !is_power_of_2(dev->zone_size))) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + + return 0; +} + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION +static bool __null_setup_fault(struct fault_attr *attr, char *str) +{ + if (!str[0]) + return true; + + if (!setup_fault_attr(attr, str)) + return false; + + attr->verbose = 0; + return true; +} +#endif + +static bool null_setup_fault(void) +{ +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) + return false; + if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) + return false; + if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) + return false; +#endif + return true; +} + +static int null_add_dev(struct nullb_device *dev) +{ + struct nullb *nullb; + int rv; + + rv = null_validate_conf(dev); + if (rv) + return rv; + + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); + if (!nullb) { + rv = -ENOMEM; + goto out; + } + nullb->dev = dev; + dev->nullb = nullb; + + spin_lock_init(&nullb->lock); + + rv = setup_queues(nullb); + if (rv) + goto out_free_nullb; + + if (dev->queue_mode == NULL_Q_MQ) { + if (shared_tags) { + nullb->tag_set = &tag_set; + rv = 0; + } else { + nullb->tag_set = &nullb->__tag_set; + rv = null_init_tag_set(nullb, nullb->tag_set); + } + + if (rv) + goto out_cleanup_queues; + + if (!null_setup_fault()) + goto out_cleanup_queues; + + nullb->tag_set->timeout = 5 * HZ; + nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); + if (IS_ERR(nullb->q)) { + rv = -ENOMEM; + goto out_cleanup_tags; + } + } else if (dev->queue_mode == NULL_Q_BIO) { + nullb->q = blk_alloc_queue(dev->home_node); + if (!nullb->q) { + rv = -ENOMEM; + goto out_cleanup_queues; + } + rv = init_driver_queues(nullb); + if (rv) + goto out_cleanup_blk_queue; + } + + if (dev->mbps) { + set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); + nullb_setup_bwtimer(nullb); + } + + if (dev->cache_size > 0) { + set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); + blk_queue_write_cache(nullb->q, true, true); + } + + if (dev->zoned) { + rv = 
null_init_zoned_dev(dev, nullb->q); + if (rv) + goto out_cleanup_blk_queue; + } + + nullb->q->queuedata = nullb; + blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); + + mutex_lock(&lock); + rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); + if (rv < 0) { + mutex_unlock(&lock); + goto out_cleanup_zone; + } + nullb->index = rv; + dev->index = rv; + mutex_unlock(&lock); + + blk_queue_logical_block_size(nullb->q, dev->blocksize); + blk_queue_physical_block_size(nullb->q, dev->blocksize); + + null_config_discard(nullb); + + sprintf(nullb->disk_name, "nullb%d", nullb->index); + + rv = null_gendisk_register(nullb); + if (rv) + goto out_ida_free; + + mutex_lock(&lock); + list_add_tail(&nullb->list, &nullb_list); + mutex_unlock(&lock); + + return 0; + +out_ida_free: + ida_free(&nullb_indexes, nullb->index); +out_cleanup_zone: + null_free_zoned_dev(dev); +out_cleanup_blk_queue: + blk_cleanup_queue(nullb->q); +out_cleanup_tags: + if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + blk_mq_free_tag_set(nullb->tag_set); +out_cleanup_queues: + cleanup_queues(nullb); +out_free_nullb: + kfree(nullb); + dev->nullb = NULL; +out: + return rv; +} + +static int __init null_init(void) +{ + int ret = 0; + unsigned int i; + struct nullb *nullb; + struct nullb_device *dev; + + if (g_bs > PAGE_SIZE) { + pr_warn("invalid block size\n"); + pr_warn("defaulting block size to %lu\n", PAGE_SIZE); + g_bs = PAGE_SIZE; + } + + if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { + pr_err("invalid home_node value\n"); + g_home_node = NUMA_NO_NODE; + } + + if (g_queue_mode == NULL_Q_RQ) { + pr_err("legacy IO path no longer available\n"); + return -EINVAL; + } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { + if (g_submit_queues != nr_online_nodes) { + pr_warn("submit_queues param is set to %u.\n", + nr_online_nodes); + g_submit_queues = nr_online_nodes; + } + } else if (g_submit_queues > nr_cpu_ids) + g_submit_queues = nr_cpu_ids; + else if (g_submit_queues <= 0) + g_submit_queues = 1; + + if (g_queue_mode == NULL_Q_MQ && shared_tags) { + ret = null_init_tag_set(NULL, &tag_set); + if (ret) + return ret; + } + + config_group_init(&nullb_subsys.su_group); + mutex_init(&nullb_subsys.su_mutex); + + ret = configfs_register_subsystem(&nullb_subsys); + if (ret) + goto err_tagset; + + mutex_init(&lock); + + null_major = register_blkdev(0, "nullb"); + if (null_major < 0) { + ret = null_major; + goto err_conf; + } + + for (i = 0; i < nr_devices; i++) { + dev = null_alloc_dev(); + if (!dev) { + ret = -ENOMEM; + goto err_dev; + } + ret = null_add_dev(dev); + if (ret) { + null_free_dev(dev); + goto err_dev; + } + } + + pr_info("module loaded\n"); + return 0; + +err_dev: + while (!list_empty(&nullb_list)) { + nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; + null_del_dev(nullb); + null_free_dev(dev); + } + unregister_blkdev(null_major, "nullb"); +err_conf: + configfs_unregister_subsystem(&nullb_subsys); +err_tagset: + if (g_queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); + return ret; +} + +static void __exit null_exit(void) +{ + struct nullb *nullb; + + configfs_unregister_subsystem(&nullb_subsys); + + unregister_blkdev(null_major, "nullb"); + + mutex_lock(&lock); + while (!list_empty(&nullb_list)) { + struct nullb_device *dev; + + nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; + null_del_dev(nullb); + null_free_dev(dev); + } + mutex_unlock(&lock); + 
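+	/* With shared_tags, every device used the module-wide tag_set; free it last, after all nullb devices have been deleted above. */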
+ if (g_queue_mode == NULL_Q_MQ && shared_tags) + blk_mq_free_tag_set(&tag_set); +} + +module_init(null_init); +module_exit(null_exit); + +MODULE_AUTHOR("Jens Axboe <axboe@xxxxxxxxx>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h new file mode 100644 index 000000000000..7de703f28617 --- /dev/null +++ b/drivers/block/null_blk/null_blk.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BLK_NULL_BLK_H +#define __BLK_NULL_BLK_H + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/blk-mq.h> +#include <linux/hrtimer.h> +#include <linux/configfs.h> +#include <linux/badblocks.h> +#include <linux/fault-inject.h> + +struct nullb_cmd { + struct request *rq; + struct bio *bio; + unsigned int tag; + blk_status_t error; + struct nullb_queue *nq; + struct hrtimer timer; + bool fake_timeout; +}; + +struct nullb_queue { + unsigned long *tag_map; + wait_queue_head_t wait; + unsigned int queue_depth; + struct nullb_device *dev; + unsigned int requeue_selection; + + struct nullb_cmd *cmds; +}; + +struct nullb_device { + struct nullb *nullb; + struct config_item item; + struct radix_tree_root data; /* data stored in the disk */ + struct radix_tree_root cache; /* disk cache data */ + unsigned long flags; /* device flags */ + unsigned int curr_cache; + struct badblocks badblocks; + + unsigned int nr_zones; + unsigned int nr_zones_imp_open; + unsigned int nr_zones_exp_open; + unsigned int nr_zones_closed; + struct blk_zone *zones; + sector_t zone_size_sects; + spinlock_t zone_lock; + unsigned long *zone_locks; + + unsigned long size; /* device size in MB */ + unsigned long completion_nsec; /* time in ns to complete a request */ + unsigned long cache_size; /* disk cache size in MB */ + unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ + unsigned int zone_nr_conv; /* number of conventional zones */ + unsigned int zone_max_open; /* max number of open zones */ + unsigned int zone_max_active; /* max number of active zones */ + unsigned int submit_queues; /* number of submission queues */ + unsigned int home_node; /* home node for the device */ + unsigned int queue_mode; /* block interface */ + unsigned int blocksize; /* block size */ + unsigned int irqmode; /* IRQ completion handler */ + unsigned int hw_queue_depth; /* queue depth */ + unsigned int index; /* index of the disk, only valid with a disk */ + unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ + bool blocking; /* blocking blk-mq device */ + bool use_per_node_hctx; /* use per-node allocation for hardware context */ + bool power; /* power on/off the device */ + bool memory_backed; /* if data is stored in memory */ + bool discard; /* if support discard */ + bool zoned; /* if device is zoned */ +}; + +struct nullb { + struct nullb_device *dev; + struct list_head list; + unsigned int index; + struct request_queue *q; + struct gendisk *disk; + struct blk_mq_tag_set *tag_set; + struct blk_mq_tag_set __tag_set; + unsigned int queue_depth; + atomic_long_t cur_bytes; + struct hrtimer bw_timer; + unsigned long cache_flush_pos; + spinlock_t lock; + + struct nullb_queue *queues; + unsigned int nr_queues; + char disk_name[DISK_NAME_LEN]; +}; + +blk_status_t null_process_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + unsigned int nr_sectors); + +#ifdef CONFIG_BLK_DEV_ZONED +int 
null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); +int null_register_zoned_dev(struct nullb *nullb); +void null_free_zoned_dev(struct nullb_device *dev); +int null_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors); +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len); +#else +static inline int null_init_zoned_dev(struct nullb_device *dev, + struct request_queue *q) +{ + pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); + return -EINVAL; +} +static inline int null_register_zoned_dev(struct nullb *nullb) +{ + return -ENODEV; +} +static inline void null_free_zoned_dev(struct nullb_device *dev) {} +static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, sector_t nr_sectors) +{ + return BLK_STS_NOTSUPP; +} +static inline size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, + unsigned int len) +{ + return len; +} +#define null_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ +#endif /* __BLK_NULL_BLK_H */ diff --git a/drivers/block/null_blk/trace.c b/drivers/block/null_blk/trace.c new file mode 100644 index 000000000000..3711cba16071 --- /dev/null +++ b/drivers/block/null_blk/trace.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * null_blk trace-related helpers. + * + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ +#include "trace.h" + +/* + * Helper used by all null_blk tracepoints to extract the disk name. + */ +const char *nullb_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (name && *name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h new file mode 100644 index 000000000000..ce3b430e88c5 --- /dev/null +++ b/drivers/block/null_blk/trace.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * null_blk device driver tracepoints. + * + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nullb + +#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NULLB_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "null_blk.h" + +const char *nullb_trace_disk_name(struct trace_seq *p, char *name); + +#define __print_disk_name(name) nullb_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline void __assign_disk_name(char *name, struct gendisk *disk) +{ + if (disk) + memcpy(name, disk->disk_name, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); +} +#endif + +TRACE_EVENT(nullb_zone_op, + TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, + unsigned int zone_cond), + TP_ARGS(cmd, zone_no, zone_cond), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(enum req_opf, op) + __field(unsigned int, zone_no) + __field(unsigned int, zone_cond) + ), + TP_fast_assign( + __entry->op = req_op(cmd->rq); + __entry->zone_no = zone_no; + __entry->zone_cond = zone_cond; + __assign_disk_name(__entry->disk, cmd->rq->rq_disk); + ), + TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", + __print_disk_name(__entry->disk), + blk_op_str(__entry->op), + __entry->zone_no, + blk_zone_cond_str(__entry->zone_cond)) +); + +TRACE_EVENT(nullb_report_zones, + TP_PROTO(struct nullb *nullb, unsigned int nr_zones), + TP_ARGS(nullb, nr_zones), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(unsigned int, nr_zones) + ), + TP_fast_assign( + __entry->nr_zones = nr_zones; + __assign_disk_name(__entry->disk, nullb->disk); + ), + TP_printk("%s nr_zones=%u", + __print_disk_name(__entry->disk), __entry->nr_zones) +); + +#endif /* _TRACE_NULLB_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c new file mode 100644 index 000000000000..41220ce59659 --- /dev/null +++ b/drivers/block/null_blk/zoned.c @@ -0,0 +1,617 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/vmalloc.h> +#include <linux/bitmap.h> +#include "null_blk.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) + +static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) +{ + return sect >> ilog2(dev->zone_size_sects); +} + +int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) +{ + sector_t dev_capacity_sects, zone_capacity_sects; + sector_t sector = 0; + unsigned int i; + + if (!is_power_of_2(dev->zone_size)) { + pr_err("zone_size must be power-of-two\n"); + return -EINVAL; + } + if (dev->zone_size > dev->size) { + pr_err("Zone size larger than device capacity\n"); + return -EINVAL; + } + + if (!dev->zone_capacity) + dev->zone_capacity = dev->zone_size; + + if (dev->zone_capacity > dev->zone_size) { + pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); + return -EINVAL; + } + + zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); + dev_capacity_sects = MB_TO_SECTS(dev->size); + dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); + dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); + if (dev_capacity_sects & (dev->zone_size_sects - 1)) + dev->nr_zones++; + + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), + GFP_KERNEL | __GFP_ZERO); + if (!dev->zones) + return 
-ENOMEM; + + /* + * With memory backing, the zone_lock spinlock needs to be temporarily + * released to avoid scheduling in atomic context. To guarantee zone + * information protection, use a bitmap to lock zones with + * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing + * implies that the queue is marked with BLK_MQ_F_BLOCKING. + */ + spin_lock_init(&dev->zone_lock); + if (dev->memory_backed) { + dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); + if (!dev->zone_locks) { + kvfree(dev->zones); + return -ENOMEM; + } + } + + if (dev->zone_nr_conv >= dev->nr_zones) { + dev->zone_nr_conv = dev->nr_zones - 1; + pr_info("changed the number of conventional zones to %u\n", + dev->zone_nr_conv); + } + + /* Max active zones has to be < number of seq zones in order to be enforceable */ + if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_active = 0; + pr_info("zone_max_active limit disabled, limit >= zone count\n"); + } + + /* Max open zones has to be <= max active zones */ + if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { + dev->zone_max_open = dev->zone_max_active; + pr_info("changed the maximum number of open zones to %u\n", + dev->zone_max_open); + } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { + dev->zone_max_open = 0; + pr_info("zone_max_open limit disabled, limit >= zone count\n"); + } + + for (i = 0; i < dev->zone_nr_conv; i++) { + struct blk_zone *zone = &dev->zones[i]; + + zone->start = sector; + zone->len = dev->zone_size_sects; + zone->capacity = zone->len; + zone->wp = zone->start + zone->len; + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->cond = BLK_ZONE_COND_NOT_WP; + + sector += dev->zone_size_sects; + } + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + struct blk_zone *zone = &dev->zones[i]; + + zone->start = zone->wp = sector; + if (zone->start + dev->zone_size_sects > dev_capacity_sects) + zone->len = dev_capacity_sects - zone->start; + else + zone->len = dev->zone_size_sects; + zone->capacity = + min_t(sector_t, zone->len, zone_capacity_sects); + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone->cond = BLK_ZONE_COND_EMPTY; + + sector += dev->zone_size_sects; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + + return 0; +} + +int null_register_zoned_dev(struct nullb *nullb) +{ + struct nullb_device *dev = nullb->dev; + struct request_queue *q = nullb->q; + + if (queue_is_mq(q)) { + int ret = blk_revalidate_disk_zones(nullb->disk, NULL); + + if (ret) + return ret; + } else { + blk_queue_chunk_sectors(q, dev->zone_size_sects); + q->nr_zones = blkdev_nr_zones(nullb->disk); + } + + blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); + blk_queue_max_open_zones(q, dev->zone_max_open); + blk_queue_max_active_zones(q, dev->zone_max_active); + + return 0; +} + +void null_free_zoned_dev(struct nullb_device *dev) +{ + bitmap_free(dev->zone_locks); + kvfree(dev->zones); + dev->zones = NULL; +} + +static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) +{ + if (dev->memory_backed) + wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); + spin_lock_irq(&dev->zone_lock); +} + +static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) +{ + spin_unlock_irq(&dev->zone_lock); + + if (dev->memory_backed) + clear_and_wake_up_bit(zno, dev->zone_locks); +} + +int null_report_zones(struct gendisk *disk, sector_t sector, + 
unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nullb *nullb = disk->private_data; + struct nullb_device *dev = nullb->dev; + unsigned int first_zone, i, zno; + struct blk_zone zone; + int error; + + first_zone = null_zone_no(dev, sector); + if (first_zone >= dev->nr_zones) + return 0; + + nr_zones = min(nr_zones, dev->nr_zones - first_zone); + trace_nullb_report_zones(nullb, nr_zones); + + zno = first_zone; + for (i = 0; i < nr_zones; i++, zno++) { + /* + * Stacked DM target drivers will remap the zone information by + * modifying the zone information passed to the report callback. + * So use a local copy to avoid corruption of the device zone + * array. + */ + null_lock_zone(dev, zno); + memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); + null_unlock_zone(dev, zno); + + error = cb(&zone, i, data); + if (error) + return error; + } + + return nr_zones; +} + +/* + * This is called in the case of memory backing from null_process_cmd() + * with the target zone already locked. + */ +size_t null_zone_valid_read_len(struct nullb *nullb, + sector_t sector, unsigned int len) +{ + struct nullb_device *dev = nullb->dev; + struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + unsigned int nr_sectors = len >> SECTOR_SHIFT; + + /* Read must be below the write pointer position */ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || + sector + nr_sectors <= zone->wp) + return len; + + if (sector > zone->wp) + return 0; + + return (zone->wp - sector) << SECTOR_SHIFT; +} + +static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } + + return BLK_STS_OK; +} + +static void null_close_first_imp_zone(struct nullb_device *dev) +{ + unsigned int i; + + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { + null_close_zone(dev, &dev->zones[i]); + return; + } + } +} + +static blk_status_t null_check_active(struct nullb_device *dev) +{ + if (!dev->zone_max_active) + return BLK_STS_OK; + + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + + dev->nr_zones_closed < dev->zone_max_active) + return BLK_STS_OK; + + return BLK_STS_ZONE_ACTIVE_RESOURCE; +} + +static blk_status_t null_check_open(struct nullb_device *dev) +{ + if (!dev->zone_max_open) + return BLK_STS_OK; + + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) + return BLK_STS_OK; + + if (dev->nr_zones_imp_open) { + if (null_check_active(dev) == BLK_STS_OK) { + null_close_first_imp_zone(dev); + return BLK_STS_OK; + } + } + + return BLK_STS_ZONE_OPEN_RESOURCE; +} + +/* + * This function matches the manage open zone resources function in the ZBC standard, + * with the addition of max active zones support (added in the ZNS standard). + * + * The function determines if a zone can transition to implicit open or explicit open, + * while maintaining the max open zone (and max active zone) limit(s). 
It may close an + * implicit open zone in order to make additional zone resources available. + * + * ZBC states that an implicit open zone shall be closed only if there is not + * room within the open limit. However, with the addition of an active limit, + * it is not certain that closing an implicit open zone will allow a new zone + * to be opened, since we might already be at the active limit capacity. + */ +static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) +{ + blk_status_t ret; + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + ret = null_check_active(dev); + if (ret != BLK_STS_OK) + return ret; + fallthrough; + case BLK_ZONE_COND_CLOSED: + return null_check_open(dev); + default: + /* Should never be called for other states */ + WARN_ON(1); + return BLK_STS_IOERR; + } +} + +static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, + unsigned int nr_sectors, bool append) +{ + struct nullb_device *dev = cmd->nq->dev; + unsigned int zno = null_zone_no(dev, sector); + struct blk_zone *zone = &dev->zones[zno]; + blk_status_t ret; + + trace_nullb_zone_op(cmd, zno, zone->cond); + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (append) + return BLK_STS_IOERR; + return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + } + + null_lock_zone(dev, zno); + + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* Cannot write to a full zone */ + ret = BLK_STS_IOERR; + goto unlock; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + goto unlock; + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + break; + default: + /* Invalid zone condition */ + ret = BLK_STS_IOERR; + goto unlock; + } + + /* + * Regular writes must be at the write pointer position. + * Zone append writes are automatically issued at the write + * pointer and the position returned using the request or BIO + * sector. + */ + if (append) { + sector = zone->wp; + if (cmd->bio) + cmd->bio->bi_iter.bi_sector = sector; + else + cmd->rq->__sector = sector; + } else if (sector != zone->wp) { + ret = BLK_STS_IOERR; + goto unlock; + } + + if (zone->wp + nr_sectors > zone->start + zone->capacity) { + ret = BLK_STS_IOERR; + goto unlock; + } + + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + /* + * Memory backing allocation may sleep: release the zone_lock spinlock + * to avoid scheduling in atomic context. Zone operation atomicity is + * still guaranteed through the zone_locks bitmap. 
+ */ + if (dev->memory_backed) + spin_unlock_irq(&dev->zone_lock); + ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + if (dev->memory_backed) + spin_lock_irq(&dev->zone_lock); + + if (ret != BLK_STS_OK) + goto unlock; + + zone->wp += nr_sectors; + if (zone->wp == zone->start + zone->capacity) { + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + zone->cond = BLK_ZONE_COND_FULL; + } + ret = BLK_STS_OK; + +unlock: + null_unlock_zone(dev, zno); + + return ret; +} + +static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + blk_status_t ret; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + /* open operation on exp open is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + dev->nr_zones_exp_open++; + + return BLK_STS_OK; +} + +static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + blk_status_t ret; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* finish operation on full is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; + dev->nr_zones_closed--; + break; + default: + return BLK_STS_IOERR; + } + + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->len; + + return BLK_STS_OK; +} + +static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + /* reset operation on empty is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_FULL: + break; + default: + return BLK_STS_IOERR; + } + + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + + return BLK_STS_OK; +} + +static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector) +{ + struct nullb_device *dev = cmd->nq->dev; + unsigned int zone_no; + struct blk_zone *zone; + blk_status_t ret; + size_t i; + + if (op == REQ_OP_ZONE_RESET_ALL) { + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + null_lock_zone(dev, i); + zone = &dev->zones[i]; + if (zone->cond != BLK_ZONE_COND_EMPTY) { + null_reset_zone(dev, zone); + trace_nullb_zone_op(cmd, i, zone->cond); + } + null_unlock_zone(dev, i); + } + return BLK_STS_OK; + } + + zone_no = null_zone_no(dev, sector); + zone = &dev->zones[zone_no]; + + null_lock_zone(dev, zone_no); 
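+	/* Zone lock held: the condition checks and transitions in the switch below are serialized against I/O targeting this zone. */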
+ + switch (op) { + case REQ_OP_ZONE_RESET: + ret = null_reset_zone(dev, zone); + break; + case REQ_OP_ZONE_OPEN: + ret = null_open_zone(dev, zone); + break; + case REQ_OP_ZONE_CLOSE: + ret = null_close_zone(dev, zone); + break; + case REQ_OP_ZONE_FINISH: + ret = null_finish_zone(dev, zone); + break; + default: + ret = BLK_STS_NOTSUPP; + break; + } + + if (ret == BLK_STS_OK) + trace_nullb_zone_op(cmd, zone_no, zone->cond); + + null_unlock_zone(dev, zone_no); + + return ret; +} + +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector, sector_t nr_sectors) +{ + struct nullb_device *dev = cmd->nq->dev; + unsigned int zno = null_zone_no(dev, sector); + blk_status_t sts; + + switch (op) { + case REQ_OP_WRITE: + sts = null_zone_write(cmd, sector, nr_sectors, false); + break; + case REQ_OP_ZONE_APPEND: + sts = null_zone_write(cmd, sector, nr_sectors, true); + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + sts = null_zone_mgmt(cmd, op, sector); + break; + default: + null_lock_zone(dev, zno); + sts = null_process_cmd(cmd, op, sector, nr_sectors); + null_unlock_zone(dev, zno); + } + + return sts; +} diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c deleted file mode 100644 index c6ba8f9f3f31..000000000000 --- a/drivers/block/null_blk_main.c +++ /dev/null @@ -1,2036 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Add configfs and memory store: Kyungchan Koh <kkc6196@xxxxxx> and - * Shaohua Li <shli@xxxxxx> - */ -#include <linux/module.h> - -#include <linux/moduleparam.h> -#include <linux/sched.h> -#include <linux/fs.h> -#include <linux/init.h> -#include "null_blk.h" - -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - -#define FREE_BATCH 16 - -#define TICKS_PER_SEC 50ULL -#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -static DECLARE_FAULT_ATTR(null_timeout_attr); -static DECLARE_FAULT_ATTR(null_requeue_attr); -static DECLARE_FAULT_ATTR(null_init_hctx_attr); -#endif - -static inline u64 mb_per_tick(int mbps) -{ - return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); -} - -/* - * Status flags for nullb_device. - * - * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. - * UP: Device is currently on and visible in userspace. - * THROTTLED: Device is being throttled. - * CACHE: Device is using a write-back cache. - */ -enum nullb_device_flags { - NULLB_DEV_FL_CONFIGURED = 0, - NULLB_DEV_FL_UP = 1, - NULLB_DEV_FL_THROTTLED = 2, - NULLB_DEV_FL_CACHE = 3, -}; - -#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) -/* - * nullb_page is a page in memory for nullb devices. - * - * @page: The page holding the data. - * @bitmap: The bitmap represents which sector in the page has data. - * Each bit represents one block size. For example, sector 8 - * will use the 7th bit - * The highest 2 bits of bitmap are for special purpose. LOCK means the cache - * page is being flushing to storage. FREE means the cache page is freed and - * should be skipped from flushing to storage. 
Please see - * null_make_cache_space - */ -struct nullb_page { - struct page *page; - DECLARE_BITMAP(bitmap, MAP_SZ); -}; -#define NULLB_PAGE_LOCK (MAP_SZ - 1) -#define NULLB_PAGE_FREE (MAP_SZ - 2) - -static LIST_HEAD(nullb_list); -static struct mutex lock; -static int null_major; -static DEFINE_IDA(nullb_indexes); -static struct blk_mq_tag_set tag_set; - -enum { - NULL_IRQ_NONE = 0, - NULL_IRQ_SOFTIRQ = 1, - NULL_IRQ_TIMER = 2, -}; - -enum { - NULL_Q_BIO = 0, - NULL_Q_RQ = 1, - NULL_Q_MQ = 2, -}; - -static int g_no_sched; -module_param_named(no_sched, g_no_sched, int, 0444); -MODULE_PARM_DESC(no_sched, "No io scheduler"); - -static int g_submit_queues = 1; -module_param_named(submit_queues, g_submit_queues, int, 0444); -MODULE_PARM_DESC(submit_queues, "Number of submission queues"); - -static int g_home_node = NUMA_NO_NODE; -module_param_named(home_node, g_home_node, int, 0444); -MODULE_PARM_DESC(home_node, "Home node for the device"); - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -/* - * For more details about fault injection, please refer to - * Documentation/fault-injection/fault-injection.rst. - */ -static char g_timeout_str[80]; -module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); -MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>"); - -static char g_requeue_str[80]; -module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); -MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); - -static char g_init_hctx_str[80]; -module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); -MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); -#endif - -static int g_queue_mode = NULL_Q_MQ; - -static int null_param_store_val(const char *str, int *val, int min, int max) -{ - int ret, new_val; - - ret = kstrtoint(str, 10, &new_val); - if (ret) - return -EINVAL; - - if (new_val < min || new_val > max) - return -EINVAL; - - *val = new_val; - return 0; -} - -static int null_set_queue_mode(const char *str, const struct kernel_param *kp) -{ - return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); -} - -static const struct kernel_param_ops null_queue_mode_param_ops = { - .set = null_set_queue_mode, - .get = param_get_int, -}; - -device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); -MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); - -static int g_gb = 250; -module_param_named(gb, g_gb, int, 0444); -MODULE_PARM_DESC(gb, "Size in GB"); - -static int g_bs = 512; -module_param_named(bs, g_bs, int, 0444); -MODULE_PARM_DESC(bs, "Block size (in bytes)"); - -static unsigned int nr_devices = 1; -module_param(nr_devices, uint, 0444); -MODULE_PARM_DESC(nr_devices, "Number of devices to register"); - -static bool g_blocking; -module_param_named(blocking, g_blocking, bool, 0444); -MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); - -static bool shared_tags; -module_param(shared_tags, bool, 0444); -MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); - -static bool g_shared_tag_bitmap; -module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); -MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); - -static int g_irqmode = NULL_IRQ_SOFTIRQ; - -static int null_set_irqmode(const char *str, const struct kernel_param *kp) -{ - return 
null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, - NULL_IRQ_TIMER); -} - -static const struct kernel_param_ops null_irqmode_param_ops = { - .set = null_set_irqmode, - .get = param_get_int, -}; - -device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); -MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); - -static unsigned long g_completion_nsec = 10000; -module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); -MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); - -static int g_hw_queue_depth = 64; -module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); -MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); - -static bool g_use_per_node_hctx; -module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); -MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); - -static bool g_zoned; -module_param_named(zoned, g_zoned, bool, S_IRUGO); -MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); - -static unsigned long g_zone_size = 256; -module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); -MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); - -static unsigned long g_zone_capacity; -module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); -MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); - -static unsigned int g_zone_nr_conv; -module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); -MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); - -static unsigned int g_zone_max_open; -module_param_named(zone_max_open, g_zone_max_open, uint, 0444); -MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); - -static unsigned int g_zone_max_active; -module_param_named(zone_max_active, g_zone_max_active, uint, 0444); -MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); - -static struct nullb_device *null_alloc_dev(void); -static void null_free_dev(struct nullb_device *dev); -static void null_del_dev(struct nullb *nullb); -static int null_add_dev(struct nullb_device *dev); -static void null_free_device_storage(struct nullb_device *dev, bool is_cache); - -static inline struct nullb_device *to_nullb_device(struct config_item *item) -{ - return item ? 
container_of(item, struct nullb_device, item) : NULL; -} - -static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) -{ - return snprintf(page, PAGE_SIZE, "%u\n", val); -} - -static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, - char *page) -{ - return snprintf(page, PAGE_SIZE, "%lu\n", val); -} - -static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) -{ - return snprintf(page, PAGE_SIZE, "%u\n", val); -} - -static ssize_t nullb_device_uint_attr_store(unsigned int *val, - const char *page, size_t count) -{ - unsigned int tmp; - int result; - - result = kstrtouint(page, 0, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -static ssize_t nullb_device_ulong_attr_store(unsigned long *val, - const char *page, size_t count) -{ - int result; - unsigned long tmp; - - result = kstrtoul(page, 0, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, - size_t count) -{ - bool tmp; - int result; - - result = kstrtobool(page, &tmp); - if (result < 0) - return result; - - *val = tmp; - return count; -} - -/* The following macro should only be used with TYPE = {uint, ulong, bool}. */ -#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ -static ssize_t \ -nullb_device_##NAME##_show(struct config_item *item, char *page) \ -{ \ - return nullb_device_##TYPE##_attr_show( \ - to_nullb_device(item)->NAME, page); \ -} \ -static ssize_t \ -nullb_device_##NAME##_store(struct config_item *item, const char *page, \ - size_t count) \ -{ \ - int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ - struct nullb_device *dev = to_nullb_device(item); \ - TYPE new_value = 0; \ - int ret; \ - \ - ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ - if (ret < 0) \ - return ret; \ - if (apply_fn) \ - ret = apply_fn(dev, new_value); \ - else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ - ret = -EBUSY; \ - if (ret < 0) \ - return ret; \ - dev->NAME = new_value; \ - return count; \ -} \ -CONFIGFS_ATTR(nullb_device_, NAME); - -static int nullb_apply_submit_queues(struct nullb_device *dev, - unsigned int submit_queues) -{ - struct nullb *nullb = dev->nullb; - struct blk_mq_tag_set *set; - - if (!nullb) - return 0; - - /* - * Make sure that null_init_hctx() does not access nullb->queues[] past - * the end of that array. - */ - if (submit_queues > nr_cpu_ids) - return -EINVAL; - set = nullb->tag_set; - blk_mq_update_nr_hw_queues(set, submit_queues); - return set->nr_hw_queues == submit_queues ? 
0 : -ENOMEM; -} - -NULLB_DEVICE_ATTR(size, ulong, NULL); -NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); -NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); -NULLB_DEVICE_ATTR(home_node, uint, NULL); -NULLB_DEVICE_ATTR(queue_mode, uint, NULL); -NULLB_DEVICE_ATTR(blocksize, uint, NULL); -NULLB_DEVICE_ATTR(irqmode, uint, NULL); -NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); -NULLB_DEVICE_ATTR(index, uint, NULL); -NULLB_DEVICE_ATTR(blocking, bool, NULL); -NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); -NULLB_DEVICE_ATTR(memory_backed, bool, NULL); -NULLB_DEVICE_ATTR(discard, bool, NULL); -NULLB_DEVICE_ATTR(mbps, uint, NULL); -NULLB_DEVICE_ATTR(cache_size, ulong, NULL); -NULLB_DEVICE_ATTR(zoned, bool, NULL); -NULLB_DEVICE_ATTR(zone_size, ulong, NULL); -NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); -NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); -NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); -NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); - -static ssize_t nullb_device_power_show(struct config_item *item, char *page) -{ - return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); -} - -static ssize_t nullb_device_power_store(struct config_item *item, - const char *page, size_t count) -{ - struct nullb_device *dev = to_nullb_device(item); - bool newp = false; - ssize_t ret; - - ret = nullb_device_bool_attr_store(&newp, page, count); - if (ret < 0) - return ret; - - if (!dev->power && newp) { - if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) - return count; - if (null_add_dev(dev)) { - clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return -ENOMEM; - } - - set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); - dev->power = newp; - } else if (dev->power && !newp) { - if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); - dev->power = newp; - null_del_dev(dev->nullb); - mutex_unlock(&lock); - } - clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); - } - - return count; -} - -CONFIGFS_ATTR(nullb_device_, power); - -static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) -{ - struct nullb_device *t_dev = to_nullb_device(item); - - return badblocks_show(&t_dev->badblocks, page, 0); -} - -static ssize_t nullb_device_badblocks_store(struct config_item *item, - const char *page, size_t count) -{ - struct nullb_device *t_dev = to_nullb_device(item); - char *orig, *buf, *tmp; - u64 start, end; - int ret; - - orig = kstrndup(page, count, GFP_KERNEL); - if (!orig) - return -ENOMEM; - - buf = strstrip(orig); - - ret = -EINVAL; - if (buf[0] != '+' && buf[0] != '-') - goto out; - tmp = strchr(&buf[1], '-'); - if (!tmp) - goto out; - *tmp = '\0'; - ret = kstrtoull(buf + 1, 0, &start); - if (ret) - goto out; - ret = kstrtoull(tmp + 1, 0, &end); - if (ret) - goto out; - ret = -EINVAL; - if (start > end) - goto out; - /* enable badblocks */ - cmpxchg(&t_dev->badblocks.shift, -1, 0); - if (buf[0] == '+') - ret = badblocks_set(&t_dev->badblocks, start, - end - start + 1, 1); - else - ret = badblocks_clear(&t_dev->badblocks, start, - end - start + 1); - if (ret == 0) - ret = count; -out: - kfree(orig); - return ret; -} -CONFIGFS_ATTR(nullb_device_, badblocks); - -static struct configfs_attribute *nullb_device_attrs[] = { - &nullb_device_attr_size, - &nullb_device_attr_completion_nsec, - &nullb_device_attr_submit_queues, - &nullb_device_attr_home_node, - &nullb_device_attr_queue_mode, - &nullb_device_attr_blocksize, - &nullb_device_attr_irqmode, - &nullb_device_attr_hw_queue_depth, - &nullb_device_attr_index, - &nullb_device_attr_blocking, - 
&nullb_device_attr_use_per_node_hctx, - &nullb_device_attr_power, - &nullb_device_attr_memory_backed, - &nullb_device_attr_discard, - &nullb_device_attr_mbps, - &nullb_device_attr_cache_size, - &nullb_device_attr_badblocks, - &nullb_device_attr_zoned, - &nullb_device_attr_zone_size, - &nullb_device_attr_zone_capacity, - &nullb_device_attr_zone_nr_conv, - &nullb_device_attr_zone_max_open, - &nullb_device_attr_zone_max_active, - NULL, -}; - -static void nullb_device_release(struct config_item *item) -{ - struct nullb_device *dev = to_nullb_device(item); - - null_free_device_storage(dev, false); - null_free_dev(dev); -} - -static struct configfs_item_operations nullb_device_ops = { - .release = nullb_device_release, -}; - -static const struct config_item_type nullb_device_type = { - .ct_item_ops = &nullb_device_ops, - .ct_attrs = nullb_device_attrs, - .ct_owner = THIS_MODULE, -}; - -static struct -config_item *nullb_group_make_item(struct config_group *group, const char *name) -{ - struct nullb_device *dev; - - dev = null_alloc_dev(); - if (!dev) - return ERR_PTR(-ENOMEM); - - config_item_init_type_name(&dev->item, name, &nullb_device_type); - - return &dev->item; -} - -static void -nullb_group_drop_item(struct config_group *group, struct config_item *item) -{ - struct nullb_device *dev = to_nullb_device(item); - - if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); - dev->power = false; - null_del_dev(dev->nullb); - mutex_unlock(&lock); - } - - config_item_put(item); -} - -static ssize_t memb_group_features_show(struct config_item *item, char *page) -{ - return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); -} - -CONFIGFS_ATTR_RO(memb_group_, features); - -static struct configfs_attribute *nullb_group_attrs[] = { - &memb_group_attr_features, - NULL, -}; - -static struct configfs_group_operations nullb_group_ops = { - .make_item = nullb_group_make_item, - .drop_item = nullb_group_drop_item, -}; - -static const struct config_item_type nullb_group_type = { - .ct_group_ops = &nullb_group_ops, - .ct_attrs = nullb_group_attrs, - .ct_owner = THIS_MODULE, -}; - -static struct configfs_subsystem nullb_subsys = { - .su_group = { - .cg_item = { - .ci_namebuf = "nullb", - .ci_type = &nullb_group_type, - }, - }, -}; - -static inline int null_cache_active(struct nullb *nullb) -{ - return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); -} - -static struct nullb_device *null_alloc_dev(void) -{ - struct nullb_device *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return NULL; - INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); - INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); - if (badblocks_init(&dev->badblocks, 0)) { - kfree(dev); - return NULL; - } - - dev->size = g_gb * 1024; - dev->completion_nsec = g_completion_nsec; - dev->submit_queues = g_submit_queues; - dev->home_node = g_home_node; - dev->queue_mode = g_queue_mode; - dev->blocksize = g_bs; - dev->irqmode = g_irqmode; - dev->hw_queue_depth = g_hw_queue_depth; - dev->blocking = g_blocking; - dev->use_per_node_hctx = g_use_per_node_hctx; - dev->zoned = g_zoned; - dev->zone_size = g_zone_size; - dev->zone_capacity = g_zone_capacity; - dev->zone_nr_conv = g_zone_nr_conv; - dev->zone_max_open = g_zone_max_open; - dev->zone_max_active = g_zone_max_active; - return dev; -} - -static void null_free_dev(struct nullb_device *dev) -{ - if (!dev) - return; - - null_free_zoned_dev(dev); - badblocks_exit(&dev->badblocks); - 
kfree(dev); -} - -static void put_tag(struct nullb_queue *nq, unsigned int tag) -{ - clear_bit_unlock(tag, nq->tag_map); - - if (waitqueue_active(&nq->wait)) - wake_up(&nq->wait); -} - -static unsigned int get_tag(struct nullb_queue *nq) -{ - unsigned int tag; - - do { - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); - if (tag >= nq->queue_depth) - return -1U; - } while (test_and_set_bit_lock(tag, nq->tag_map)); - - return tag; -} - -static void free_cmd(struct nullb_cmd *cmd) -{ - put_tag(cmd->nq, cmd->tag); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); - -static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - unsigned int tag; - - tag = get_tag(nq); - if (tag != -1U) { - cmd = &nq->cmds[tag]; - cmd->tag = tag; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - return cmd; - } - - return NULL; -} - -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) -{ - struct nullb_cmd *cmd; - DEFINE_WAIT(wait); - - cmd = __alloc_cmd(nq); - if (cmd || !can_wait) - return cmd; - - do { - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); - cmd = __alloc_cmd(nq); - if (cmd) - break; - - io_schedule(); - } while (1); - - finish_wait(&nq->wait, &wait); - return cmd; -} - -static void end_cmd(struct nullb_cmd *cmd) -{ - int queue_mode = cmd->nq->dev->queue_mode; - - switch (queue_mode) { - case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, cmd->error); - return; - case NULL_Q_BIO: - cmd->bio->bi_status = cmd->error; - bio_endio(cmd->bio); - break; - } - - free_cmd(cmd); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) -{ - end_cmd(container_of(timer, struct nullb_cmd, timer)); - - return HRTIMER_NORESTART; -} - -static void null_cmd_end_timer(struct nullb_cmd *cmd) -{ - ktime_t kt = cmd->nq->dev->completion_nsec; - - hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); -} - -static void null_complete_rq(struct request *rq) -{ - end_cmd(blk_mq_rq_to_pdu(rq)); -} - -static struct nullb_page *null_alloc_page(gfp_t gfp_flags) -{ - struct nullb_page *t_page; - - t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); - if (!t_page) - goto out; - - t_page->page = alloc_pages(gfp_flags, 0); - if (!t_page->page) - goto out_freepage; - - memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); - return t_page; -out_freepage: - kfree(t_page); -out: - return NULL; -} - -static void null_free_page(struct nullb_page *t_page) -{ - __set_bit(NULLB_PAGE_FREE, t_page->bitmap); - if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) - return; - __free_page(t_page->page); - kfree(t_page); -} - -static bool null_page_empty(struct nullb_page *page) -{ - int size = MAP_SZ - 2; - - return find_first_bit(page->bitmap, size) == size; -} - -static void null_free_sector(struct nullb *nullb, sector_t sector, - bool is_cache) -{ - unsigned int sector_bit; - u64 idx; - struct nullb_page *t_page, *ret; - struct radix_tree_root *root; - - root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; - idx = sector >> PAGE_SECTORS_SHIFT; - sector_bit = (sector & SECTOR_MASK); - - t_page = radix_tree_lookup(root, idx); - if (t_page) { - __clear_bit(sector_bit, t_page->bitmap); - - if (null_page_empty(t_page)) { - ret = radix_tree_delete_item(root, idx, t_page); - WARN_ON(ret != t_page); - null_free_page(ret); - if (is_cache) - nullb->dev->curr_cache -= PAGE_SIZE; - } - } -} - -static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, - struct nullb_page *t_page, bool is_cache) -{ - struct radix_tree_root *root; - - root = is_cache ? &nullb->dev->cache : &nullb->dev->data; - - if (radix_tree_insert(root, idx, t_page)) { - null_free_page(t_page); - t_page = radix_tree_lookup(root, idx); - WARN_ON(!t_page || t_page->page->index != idx); - } else if (is_cache) - nullb->dev->curr_cache += PAGE_SIZE; - - return t_page; -} - -static void null_free_device_storage(struct nullb_device *dev, bool is_cache) -{ - unsigned long pos = 0; - int nr_pages; - struct nullb_page *ret, *t_pages[FREE_BATCH]; - struct radix_tree_root *root; - - root = is_cache ? &dev->cache : &dev->data; - - do { - int i; - - nr_pages = radix_tree_gang_lookup(root, - (void **)t_pages, pos, FREE_BATCH); - - for (i = 0; i < nr_pages; i++) { - pos = t_pages[i]->page->index; - ret = radix_tree_delete_item(root, pos, t_pages[i]); - WARN_ON(ret != t_pages[i]); - null_free_page(ret); - } - - pos++; - } while (nr_pages == FREE_BATCH); - - if (is_cache) - dev->curr_cache = 0; -} - -static struct nullb_page *__null_lookup_page(struct nullb *nullb, - sector_t sector, bool for_write, bool is_cache) -{ - unsigned int sector_bit; - u64 idx; - struct nullb_page *t_page; - struct radix_tree_root *root; - - idx = sector >> PAGE_SECTORS_SHIFT; - sector_bit = (sector & SECTOR_MASK); - - root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; - t_page = radix_tree_lookup(root, idx); - WARN_ON(t_page && t_page->page->index != idx); - - if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) - return t_page; - - return NULL; -} - -static struct nullb_page *null_lookup_page(struct nullb *nullb, - sector_t sector, bool for_write, bool ignore_cache) -{ - struct nullb_page *page = NULL; - - if (!ignore_cache) - page = __null_lookup_page(nullb, sector, for_write, true); - if (page) - return page; - return __null_lookup_page(nullb, sector, for_write, false); -} - -static struct nullb_page *null_insert_page(struct nullb *nullb, - sector_t sector, bool ignore_cache) - __releases(&nullb->lock) - __acquires(&nullb->lock) -{ - u64 idx; - struct nullb_page *t_page; - - t_page = null_lookup_page(nullb, sector, true, ignore_cache); - if (t_page) - return t_page; - - spin_unlock_irq(&nullb->lock); - - t_page = null_alloc_page(GFP_NOIO); - if (!t_page) - goto out_lock; - - if (radix_tree_preload(GFP_NOIO)) - goto out_freepage; - - spin_lock_irq(&nullb->lock); - idx = sector >> PAGE_SECTORS_SHIFT; - t_page->page->index = idx; - t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); - radix_tree_preload_end(); - - return t_page; -out_freepage: - null_free_page(t_page); -out_lock: - spin_lock_irq(&nullb->lock); - return null_lookup_page(nullb, sector, true, ignore_cache); -} - -static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) -{ - int i; - unsigned int offset; - u64 idx; - struct nullb_page *t_page, *ret; - void *dst, *src; - - idx = c_page->page->index; - - t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); - - __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); - if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { - null_free_page(c_page); - if (t_page && null_page_empty(t_page)) { - ret = radix_tree_delete_item(&nullb->dev->data, - idx, t_page); - null_free_page(t_page); - } - return 0; - } - - if (!t_page) - return -ENOMEM; - - src = kmap_atomic(c_page->page); - dst = kmap_atomic(t_page->page); - - for (i = 0; i < PAGE_SECTORS; - i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { - if (test_bit(i, c_page->bitmap)) { - offset = (i << SECTOR_SHIFT); - memcpy(dst + offset, src + offset, - nullb->dev->blocksize); - __set_bit(i, t_page->bitmap); - } - } - - kunmap_atomic(dst); - kunmap_atomic(src); - - ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); - null_free_page(ret); - nullb->dev->curr_cache -= PAGE_SIZE; - - return 0; -} - -static int null_make_cache_space(struct nullb *nullb, unsigned long n) -{ - int i, err, nr_pages; - struct nullb_page *c_pages[FREE_BATCH]; - unsigned long flushed = 0, one_round; - -again: - if ((nullb->dev->cache_size * 1024 * 1024) > - nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) - return 0; - - nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, - (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); - /* - * nullb_flush_cache_page could unlock before using the c_pages. 
To - * avoid race, we don't allow page free - */ - for (i = 0; i < nr_pages; i++) { - nullb->cache_flush_pos = c_pages[i]->page->index; - /* - * We found the page which is being flushed to disk by other - * threads - */ - if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) - c_pages[i] = NULL; - else - __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); - } - - one_round = 0; - for (i = 0; i < nr_pages; i++) { - if (c_pages[i] == NULL) - continue; - err = null_flush_cache_page(nullb, c_pages[i]); - if (err) - return err; - one_round++; - } - flushed += one_round << PAGE_SHIFT; - - if (n > flushed) { - if (nr_pages == 0) - nullb->cache_flush_pos = 0; - if (one_round == 0) { - /* give other threads a chance */ - spin_unlock_irq(&nullb->lock); - spin_lock_irq(&nullb->lock); - } - goto again; - } - return 0; -} - -static int copy_to_nullb(struct nullb *nullb, struct page *source, - unsigned int off, sector_t sector, size_t n, bool is_fua) -{ - size_t temp, count = 0; - unsigned int offset; - struct nullb_page *t_page; - void *dst, *src; - - while (count < n) { - temp = min_t(size_t, nullb->dev->blocksize, n - count); - - if (null_cache_active(nullb) && !is_fua) - null_make_cache_space(nullb, PAGE_SIZE); - - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_insert_page(nullb, sector, - !null_cache_active(nullb) || is_fua); - if (!t_page) - return -ENOSPC; - - src = kmap_atomic(source); - dst = kmap_atomic(t_page->page); - memcpy(dst + offset, src + off + count, temp); - kunmap_atomic(dst); - kunmap_atomic(src); - - __set_bit(sector & SECTOR_MASK, t_page->bitmap); - - if (is_fua) - null_free_sector(nullb, sector, true); - - count += temp; - sector += temp >> SECTOR_SHIFT; - } - return 0; -} - -static int copy_from_nullb(struct nullb *nullb, struct page *dest, - unsigned int off, sector_t sector, size_t n) -{ - size_t temp, count = 0; - unsigned int offset; - struct nullb_page *t_page; - void *dst, *src; - - while (count < n) { - temp = min_t(size_t, nullb->dev->blocksize, n - count); - - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_lookup_page(nullb, sector, false, - !null_cache_active(nullb)); - - dst = kmap_atomic(dest); - if (!t_page) { - memset(dst + off + count, 0, temp); - goto next; - } - src = kmap_atomic(t_page->page); - memcpy(dst + off + count, src + offset, temp); - kunmap_atomic(src); -next: - kunmap_atomic(dst); - - count += temp; - sector += temp >> SECTOR_SHIFT; - } - return 0; -} - -static void nullb_fill_pattern(struct nullb *nullb, struct page *page, - unsigned int len, unsigned int off) -{ - void *dst; - - dst = kmap_atomic(page); - memset(dst + off, 0xFF, len); - kunmap_atomic(dst); -} - -static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) -{ - size_t temp; - - spin_lock_irq(&nullb->lock); - while (n > 0) { - temp = min_t(size_t, n, nullb->dev->blocksize); - null_free_sector(nullb, sector, false); - if (null_cache_active(nullb)) - null_free_sector(nullb, sector, true); - sector += temp >> SECTOR_SHIFT; - n -= temp; - } - spin_unlock_irq(&nullb->lock); -} - -static int null_handle_flush(struct nullb *nullb) -{ - int err; - - if (!null_cache_active(nullb)) - return 0; - - spin_lock_irq(&nullb->lock); - while (true) { - err = null_make_cache_space(nullb, - nullb->dev->cache_size * 1024 * 1024); - if (err || nullb->dev->curr_cache == 0) - break; - } - - WARN_ON(!radix_tree_empty(&nullb->dev->cache)); - spin_unlock_irq(&nullb->lock); - return err; -} - -static int null_transfer(struct nullb *nullb, struct page *page, - 
unsigned int len, unsigned int off, bool is_write, sector_t sector, - bool is_fua) -{ - struct nullb_device *dev = nullb->dev; - unsigned int valid_len = len; - int err = 0; - - if (!is_write) { - if (dev->zoned) - valid_len = null_zone_valid_read_len(nullb, - sector, len); - - if (valid_len) { - err = copy_from_nullb(nullb, page, off, - sector, valid_len); - off += valid_len; - len -= valid_len; - } - - if (len) - nullb_fill_pattern(nullb, page, len, off); - flush_dcache_page(page); - } else { - flush_dcache_page(page); - err = copy_to_nullb(nullb, page, off, sector, len, is_fua); - } - - return err; -} - -static int null_handle_rq(struct nullb_cmd *cmd) -{ - struct request *rq = cmd->rq; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector; - struct req_iterator iter; - struct bio_vec bvec; - - sector = blk_rq_pos(rq); - - if (req_op(rq) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, blk_rq_bytes(rq)); - return 0; - } - - spin_lock_irq(&nullb->lock); - rq_for_each_segment(bvec, rq, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(req_op(rq)), sector, - rq->cmd_flags & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - - return 0; -} - -static int null_handle_bio(struct nullb_cmd *cmd) -{ - struct bio *bio = cmd->bio; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector; - struct bio_vec bvec; - struct bvec_iter iter; - - sector = bio->bi_iter.bi_sector; - - if (bio_op(bio) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, - bio_sectors(bio) << SECTOR_SHIFT); - return 0; - } - - spin_lock_irq(&nullb->lock); - bio_for_each_segment(bvec, bio, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), sector, - bio->bi_opf & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - return 0; -} - -static void null_stop_queue(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_stop_hw_queues(q); -} - -static void null_restart_queue_async(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_start_stopped_hw_queues(q, true); -} - -static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) -{ - struct nullb_device *dev = cmd->nq->dev; - struct nullb *nullb = dev->nullb; - blk_status_t sts = BLK_STS_OK; - struct request *rq = cmd->rq; - - if (!hrtimer_active(&nullb->bw_timer)) - hrtimer_restart(&nullb->bw_timer); - - if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { - null_stop_queue(nullb); - /* race with timer */ - if (atomic_long_read(&nullb->cur_bytes) > 0) - null_restart_queue_async(nullb); - /* requeue request */ - sts = BLK_STS_DEV_RESOURCE; - } - return sts; -} - -static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, - sector_t sector, - sector_t nr_sectors) -{ - struct badblocks *bb = &cmd->nq->dev->badblocks; - sector_t first_bad; - int bad_sectors; - - if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) - return BLK_STS_IOERR; - - return BLK_STS_OK; -} - -static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op) -{ - struct nullb_device *dev = cmd->nq->dev; - 
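null_handle_throttled() above charges each request against a per-tick byte budget: if the atomic subtraction drives the budget negative, the hardware queues are stopped and the request is requeued until the bandwidth timer refills cur_bytes. A minimal single-threaded model of that accounting; the names and the fixed budget are illustrative only:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_long cur_bytes;

    /* charge one request; true means "dispatch", false means "requeue" */
    static bool charge_request(long bytes)
    {
        if (atomic_fetch_sub(&cur_bytes, bytes) - bytes < 0) {
            /* budget exhausted: stop queues, let the timer refill */
            return false;
        }
        return true;
    }

    static void timer_tick(long budget)
    {
        atomic_store(&cur_bytes, budget);   /* refill, then restart queues */
    }

    int main(void)
    {
        timer_tick(4096);
        printf("%d\n", charge_request(4096)); /* 1: exactly drains the budget */
        printf("%d\n", charge_request(512));  /* 0: over budget, requeue */
        return 0;
    }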
int err; - - if (dev->queue_mode == NULL_Q_BIO) - err = null_handle_bio(cmd); - else - err = null_handle_rq(cmd); - - return errno_to_blk_status(err); -} - -static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) -{ - struct nullb_device *dev = cmd->nq->dev; - struct bio *bio; - - if (dev->memory_backed) - return; - - if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { - zero_fill_bio(cmd->bio); - } else if (req_op(cmd->rq) == REQ_OP_READ) { - __rq_for_each_bio(bio, cmd->rq) - zero_fill_bio(bio); - } -} - -static inline void nullb_complete_cmd(struct nullb_cmd *cmd) -{ - /* - * Since root privileges are required to configure the null_blk - * driver, it is fine that this driver does not initialize the - * data buffers of read commands. Zero-initialize these buffers - * anyway if KMSAN is enabled to prevent that KMSAN complains - * about null_blk not initializing read data buffers. - */ - if (IS_ENABLED(CONFIG_KMSAN)) - nullb_zero_read_cmd_buffer(cmd); - - /* Complete IO by inline, softirq or timer */ - switch (cmd->nq->dev->irqmode) { - case NULL_IRQ_SOFTIRQ: - switch (cmd->nq->dev->queue_mode) { - case NULL_Q_MQ: - if (likely(!blk_should_fake_timeout(cmd->rq->q))) - blk_mq_complete_request(cmd->rq); - break; - case NULL_Q_BIO: - /* - * XXX: no proper submitting cpu information available. - */ - end_cmd(cmd); - break; - } - break; - case NULL_IRQ_NONE: - end_cmd(cmd); - break; - case NULL_IRQ_TIMER: - null_cmd_end_timer(cmd); - break; - } -} - -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors) -{ - struct nullb_device *dev = cmd->nq->dev; - blk_status_t ret; - - if (dev->badblocks.shift != -1) { - ret = null_handle_badblocks(cmd, sector, nr_sectors); - if (ret != BLK_STS_OK) - return ret; - } - - if (dev->memory_backed) - return null_handle_memory_backed(cmd, op); - - return BLK_STS_OK; -} - -static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors, enum req_opf op) -{ - struct nullb_device *dev = cmd->nq->dev; - struct nullb *nullb = dev->nullb; - blk_status_t sts; - - if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { - sts = null_handle_throttled(cmd); - if (sts != BLK_STS_OK) - return sts; - } - - if (op == REQ_OP_FLUSH) { - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); - goto out; - } - - if (dev->zoned) - sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); - else - sts = null_process_cmd(cmd, op, sector, nr_sectors); - - /* Do not overwrite errors (e.g. 
timeout errors) */ - if (cmd->error == BLK_STS_OK) - cmd->error = sts; - -out: - nullb_complete_cmd(cmd); - return BLK_STS_OK; -} - -static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) -{ - struct nullb *nullb = container_of(timer, struct nullb, bw_timer); - ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - unsigned int mbps = nullb->dev->mbps; - - if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) - return HRTIMER_NORESTART; - - atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); - null_restart_queue_async(nullb); - - hrtimer_forward_now(&nullb->bw_timer, timer_interval); - - return HRTIMER_RESTART; -} - -static void nullb_setup_bwtimer(struct nullb *nullb) -{ - ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - - hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - nullb->bw_timer.function = nullb_bwtimer_fn; - atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); - hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); -} - -static struct nullb_queue *nullb_to_queue(struct nullb *nullb) -{ - int index = 0; - - if (nullb->nr_queues != 1) - index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); - - return &nullb->queues[index]; -} - -static blk_qc_t null_submit_bio(struct bio *bio) -{ - sector_t sector = bio->bi_iter.bi_sector; - sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_disk->private_data; - struct nullb_queue *nq = nullb_to_queue(nullb); - struct nullb_cmd *cmd; - - cmd = alloc_cmd(nq, 1); - cmd->bio = bio; - - null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); - return BLK_QC_T_NONE; -} - -static bool should_timeout_request(struct request *rq) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_timeout_str[0]) - return should_fail(&null_timeout_attr, 1); -#endif - return false; -} - -static bool should_requeue_request(struct request *rq) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_requeue_str[0]) - return should_fail(&null_requeue_attr, 1); -#endif - return false; -} - -static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) -{ - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); - - pr_info("rq %p timed out\n", rq); - - /* - * If the device is marked as blocking (i.e. memory backed or zoned - * device), the submission path may be blocked waiting for resources - * and cause real timeouts. For these real timeouts, the submission - * path will complete the request using blk_mq_complete_request(). - * Only fake timeouts need to execute blk_mq_complete_request() here. 
- */ - cmd->error = BLK_STS_TIMEOUT; - if (cmd->fake_timeout) - blk_mq_complete_request(rq); - return BLK_EH_DONE; -} - -static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - struct nullb_queue *nq = hctx->driver_data; - sector_t nr_sectors = blk_rq_sectors(bd->rq); - sector_t sector = blk_rq_pos(bd->rq); - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - cmd->rq = bd->rq; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - cmd->fake_timeout = should_timeout_request(bd->rq); - - blk_mq_start_request(bd->rq); - - if (should_requeue_request(bd->rq)) { - /* - * Alternate between hitting the core BUSY path, and the - * driver driven requeue path - */ - nq->requeue_selection++; - if (nq->requeue_selection & 1) - return BLK_STS_RESOURCE; - else { - blk_mq_requeue_request(bd->rq, true); - return BLK_STS_OK; - } - } - if (cmd->fake_timeout) - return BLK_STS_OK; - - return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); -} - -static void cleanup_queue(struct nullb_queue *nq) -{ - kfree(nq->tag_map); - kfree(nq->cmds); -} - -static void cleanup_queues(struct nullb *nullb) -{ - int i; - - for (i = 0; i < nullb->nr_queues; i++) - cleanup_queue(&nullb->queues[i]); - - kfree(nullb->queues); -} - -static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nullb_queue *nq = hctx->driver_data; - struct nullb *nullb = nq->dev->nullb; - - nullb->nr_queues--; -} - -static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) -{ - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; - nq->dev = nullb->dev; -} - -static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, - unsigned int hctx_idx) -{ - struct nullb *nullb = hctx->queue->queuedata; - struct nullb_queue *nq; - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) - return -EFAULT; -#endif - - nq = &nullb->queues[hctx_idx]; - hctx->driver_data = nq; - null_init_queue(nullb, nq); - nullb->nr_queues++; - - return 0; -} - -static const struct blk_mq_ops null_mq_ops = { - .queue_rq = null_queue_rq, - .complete = null_complete_rq, - .timeout = null_timeout_rq, - .init_hctx = null_init_hctx, - .exit_hctx = null_exit_hctx, -}; - -static void null_del_dev(struct nullb *nullb) -{ - struct nullb_device *dev; - - if (!nullb) - return; - - dev = nullb->dev; - - ida_simple_remove(&nullb_indexes, nullb->index); - - list_del_init(&nullb->list); - - del_gendisk(nullb->disk); - - if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { - hrtimer_cancel(&nullb->bw_timer); - atomic_long_set(&nullb->cur_bytes, LONG_MAX); - null_restart_queue_async(nullb); - } - - blk_cleanup_queue(nullb->q); - if (dev->queue_mode == NULL_Q_MQ && - nullb->tag_set == &nullb->__tag_set) - blk_mq_free_tag_set(nullb->tag_set); - put_disk(nullb->disk); - cleanup_queues(nullb); - if (null_cache_active(nullb)) - null_free_device_storage(nullb->dev, true); - kfree(nullb); - dev->nullb = NULL; -} - -static void null_config_discard(struct nullb *nullb) -{ - if (nullb->dev->discard == false) - return; - - if (nullb->dev->zoned) { - nullb->dev->discard = false; - pr_info("discard option is ignored in zoned mode\n"); - return; - } - - nullb->q->limits.discard_granularity = nullb->dev->blocksize; 
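With the requeue fault attribute armed, null_queue_rq() above deliberately alternates between returning BLK_STS_RESOURCE (so the block core requeues on the BUSY path) and calling blk_mq_requeue_request() itself, exercising both requeue paths. A tiny model of that alternation; the enum values here are stand-ins for the real blk_status_t codes:

    #include <stdio.h>

    enum sts { STS_OK, STS_RESOURCE };

    static unsigned int requeue_selection;

    /* alternate between the core BUSY path and driver-driven requeue */
    static enum sts fake_requeue(void)
    {
        requeue_selection++;
        if (requeue_selection & 1)
            return STS_RESOURCE;    /* core requeues on BUSY */
        /* driver requeues explicitly, then reports success */
        return STS_OK;
    }

    int main(void)
    {
        for (int i = 0; i < 4; i++)
            printf("attempt %d -> %s requeue\n", i,
                   fake_requeue() == STS_RESOURCE ? "core" : "driver");
        return 0;
    }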
- nullb->q->limits.discard_alignment = nullb->dev->blocksize; - blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); -} - -static const struct block_device_operations null_bio_ops = { - .owner = THIS_MODULE, - .submit_bio = null_submit_bio, - .report_zones = null_report_zones, -}; - -static const struct block_device_operations null_rq_ops = { - .owner = THIS_MODULE, - .report_zones = null_report_zones, -}; - -static int setup_commands(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - int i, tag_size; - - nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); - if (!nq->cmds) - return -ENOMEM; - - tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; - nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); - if (!nq->tag_map) { - kfree(nq->cmds); - return -ENOMEM; - } - - for (i = 0; i < nq->queue_depth; i++) { - cmd = &nq->cmds[i]; - cmd->tag = -1U; - } - - return 0; -} - -static int setup_queues(struct nullb *nullb) -{ - nullb->queues = kcalloc(nr_cpu_ids, sizeof(struct nullb_queue), - GFP_KERNEL); - if (!nullb->queues) - return -ENOMEM; - - nullb->queue_depth = nullb->dev->hw_queue_depth; - - return 0; -} - -static int init_driver_queues(struct nullb *nullb) -{ - struct nullb_queue *nq; - int i, ret = 0; - - for (i = 0; i < nullb->dev->submit_queues; i++) { - nq = &nullb->queues[i]; - - null_init_queue(nullb, nq); - - ret = setup_commands(nq); - if (ret) - return ret; - nullb->nr_queues++; - } - return 0; -} - -static int null_gendisk_register(struct nullb *nullb) -{ - sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; - struct gendisk *disk; - - disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); - if (!disk) - return -ENOMEM; - set_capacity(disk, size); - - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->major = null_major; - disk->first_minor = nullb->index; - if (queue_is_mq(nullb->q)) - disk->fops = &null_rq_ops; - else - disk->fops = &null_bio_ops; - disk->private_data = nullb; - disk->queue = nullb->q; - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); - - if (nullb->dev->zoned) { - int ret = null_register_zoned_dev(nullb); - - if (ret) - return ret; - } - - add_disk(disk); - return 0; -} - -static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) -{ - set->ops = &null_mq_ops; - set->nr_hw_queues = nullb ? nullb->dev->submit_queues : - g_submit_queues; - set->queue_depth = nullb ? nullb->dev->hw_queue_depth : - g_hw_queue_depth; - set->numa_node = nullb ? 
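setup_commands() above sizes the per-queue tag bitmap with the usual ALIGN(x, BITS_PER_LONG) / BITS_PER_LONG idiom, rounding the queue depth up to a whole number of longs. A small runnable check of that sizing, assuming only that long carries CHAR_BIT * sizeof(long) bits:

    #include <stdio.h>
    #include <limits.h>

    #define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
    #define ALIGN(x, a)   (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        unsigned int queue_depth = 100;
        /* number of unsigned longs needed to hold one bit per tag */
        unsigned int tag_size = ALIGN(queue_depth, BITS_PER_LONG) / BITS_PER_LONG;

        printf("%u tags -> %u longs (%zu bytes)\n",
               queue_depth, tag_size, tag_size * sizeof(unsigned long));
        return 0;
    }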
nullb->dev->home_node : g_home_node; - set->cmd_size = sizeof(struct nullb_cmd); - set->flags = BLK_MQ_F_SHOULD_MERGE; - if (g_no_sched) - set->flags |= BLK_MQ_F_NO_SCHED; - if (g_shared_tag_bitmap) - set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; - set->driver_data = NULL; - - if ((nullb && nullb->dev->blocking) || g_blocking) - set->flags |= BLK_MQ_F_BLOCKING; - - return blk_mq_alloc_tag_set(set); -} - -static int null_validate_conf(struct nullb_device *dev) -{ - dev->blocksize = round_down(dev->blocksize, 512); - dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); - - if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { - if (dev->submit_queues != nr_online_nodes) - dev->submit_queues = nr_online_nodes; - } else if (dev->submit_queues > nr_cpu_ids) - dev->submit_queues = nr_cpu_ids; - else if (dev->submit_queues == 0) - dev->submit_queues = 1; - - dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); - dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); - - /* Do memory allocation, so set blocking */ - if (dev->memory_backed) - dev->blocking = true; - else /* cache is meaningless */ - dev->cache_size = 0; - dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, - dev->cache_size); - dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); - /* can not stop a queue */ - if (dev->queue_mode == NULL_Q_BIO) - dev->mbps = 0; - - if (dev->zoned && - (!dev->zone_size || !is_power_of_2(dev->zone_size))) { - pr_err("zone_size must be power-of-two\n"); - return -EINVAL; - } - - return 0; -} - -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION -static bool __null_setup_fault(struct fault_attr *attr, char *str) -{ - if (!str[0]) - return true; - - if (!setup_fault_attr(attr, str)) - return false; - - attr->verbose = 0; - return true; -} -#endif - -static bool null_setup_fault(void) -{ -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) - return false; - if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) - return false; - if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) - return false; -#endif - return true; -} - -static int null_add_dev(struct nullb_device *dev) -{ - struct nullb *nullb; - int rv; - - rv = null_validate_conf(dev); - if (rv) - return rv; - - nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); - if (!nullb) { - rv = -ENOMEM; - goto out; - } - nullb->dev = dev; - dev->nullb = nullb; - - spin_lock_init(&nullb->lock); - - rv = setup_queues(nullb); - if (rv) - goto out_free_nullb; - - if (dev->queue_mode == NULL_Q_MQ) { - if (shared_tags) { - nullb->tag_set = &tag_set; - rv = 0; - } else { - nullb->tag_set = &nullb->__tag_set; - rv = null_init_tag_set(nullb, nullb->tag_set); - } - - if (rv) - goto out_cleanup_queues; - - if (!null_setup_fault()) - goto out_cleanup_queues; - - nullb->tag_set->timeout = 5 * HZ; - nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); - if (IS_ERR(nullb->q)) { - rv = -ENOMEM; - goto out_cleanup_tags; - } - } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue(dev->home_node); - if (!nullb->q) { - rv = -ENOMEM; - goto out_cleanup_queues; - } - rv = init_driver_queues(nullb); - if (rv) - goto out_cleanup_blk_queue; - } - - if (dev->mbps) { - set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); - nullb_setup_bwtimer(nullb); - } - - if (dev->cache_size > 0) { - set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, true); - } - - if (dev->zoned) { - rv = 
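null_validate_conf() above quietly normalizes user input instead of rejecting it: blocksize is rounded down to a multiple of 512 and clamped to [512, 4096], submit_queues is bounded by the CPU count, and mbps is capped. A minimal sketch of the blocksize rule with round_down and clamp open-coded; the sample inputs are arbitrary:

    #include <stdio.h>

    static unsigned int round_down(unsigned int x, unsigned int m)
    {
        return x - (x % m);
    }

    static unsigned int clamp(unsigned int v, unsigned int lo, unsigned int hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    int main(void)
    {
        unsigned int inputs[] = { 0, 300, 1000, 4096, 9999 };

        for (int i = 0; i < 5; i++) {
            unsigned int bs = clamp(round_down(inputs[i], 512), 512, 4096);
            printf("blocksize %4u -> %u\n", inputs[i], bs);
        }
        return 0;
    }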
null_init_zoned_dev(dev, nullb->q); - if (rv) - goto out_cleanup_blk_queue; - } - - nullb->q->queuedata = nullb; - blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); - - mutex_lock(&lock); - rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); - if (rv < 0) { - mutex_unlock(&lock); - goto out_cleanup_zone; - } - nullb->index = rv; - dev->index = rv; - mutex_unlock(&lock); - - blk_queue_logical_block_size(nullb->q, dev->blocksize); - blk_queue_physical_block_size(nullb->q, dev->blocksize); - - null_config_discard(nullb); - - sprintf(nullb->disk_name, "nullb%d", nullb->index); - - rv = null_gendisk_register(nullb); - if (rv) - goto out_ida_free; - - mutex_lock(&lock); - list_add_tail(&nullb->list, &nullb_list); - mutex_unlock(&lock); - - return 0; - -out_ida_free: - ida_free(&nullb_indexes, nullb->index); -out_cleanup_zone: - null_free_zoned_dev(dev); -out_cleanup_blk_queue: - blk_cleanup_queue(nullb->q); -out_cleanup_tags: - if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) - blk_mq_free_tag_set(nullb->tag_set); -out_cleanup_queues: - cleanup_queues(nullb); -out_free_nullb: - kfree(nullb); - dev->nullb = NULL; -out: - return rv; -} - -static int __init null_init(void) -{ - int ret = 0; - unsigned int i; - struct nullb *nullb; - struct nullb_device *dev; - - if (g_bs > PAGE_SIZE) { - pr_warn("invalid block size\n"); - pr_warn("defaults block size to %lu\n", PAGE_SIZE); - g_bs = PAGE_SIZE; - } - - if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { - pr_err("invalid home_node value\n"); - g_home_node = NUMA_NO_NODE; - } - - if (g_queue_mode == NULL_Q_RQ) { - pr_err("legacy IO path no longer available\n"); - return -EINVAL; - } - if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { - if (g_submit_queues != nr_online_nodes) { - pr_warn("submit_queues param is set to %u.\n", - nr_online_nodes); - g_submit_queues = nr_online_nodes; - } - } else if (g_submit_queues > nr_cpu_ids) - g_submit_queues = nr_cpu_ids; - else if (g_submit_queues <= 0) - g_submit_queues = 1; - - if (g_queue_mode == NULL_Q_MQ && shared_tags) { - ret = null_init_tag_set(NULL, &tag_set); - if (ret) - return ret; - } - - config_group_init(&nullb_subsys.su_group); - mutex_init(&nullb_subsys.su_mutex); - - ret = configfs_register_subsystem(&nullb_subsys); - if (ret) - goto err_tagset; - - mutex_init(&lock); - - null_major = register_blkdev(0, "nullb"); - if (null_major < 0) { - ret = null_major; - goto err_conf; - } - - for (i = 0; i < nr_devices; i++) { - dev = null_alloc_dev(); - if (!dev) { - ret = -ENOMEM; - goto err_dev; - } - ret = null_add_dev(dev); - if (ret) { - null_free_dev(dev); - goto err_dev; - } - } - - pr_info("module loaded\n"); - return 0; - -err_dev: - while (!list_empty(&nullb_list)) { - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); - } - unregister_blkdev(null_major, "nullb"); -err_conf: - configfs_unregister_subsystem(&nullb_subsys); -err_tagset: - if (g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); - return ret; -} - -static void __exit null_exit(void) -{ - struct nullb *nullb; - - configfs_unregister_subsystem(&nullb_subsys); - - unregister_blkdev(null_major, "nullb"); - - mutex_lock(&lock); - while (!list_empty(&nullb_list)) { - struct nullb_device *dev; - - nullb = list_entry(nullb_list.next, struct nullb, list); - dev = nullb->dev; - null_del_dev(nullb); - null_free_dev(dev); - } - mutex_unlock(&lock); - 
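The out_ida_free / out_cleanup_zone / out_cleanup_blk_queue labels above are the standard kernel goto ladder: each failure jumps to the label that undoes exactly the steps completed so far, in reverse order. A compact sketch of the shape of that pattern, with placeholder setup and teardown helpers:

    #include <stdio.h>

    static int step(const char *what, int fail)
    {
        printf("setup %s\n", what);
        return fail ? -1 : 0;
    }

    static void undo(const char *what) { printf("undo  %s\n", what); }

    static int add_dev(void)
    {
        int rv;

        if ((rv = step("queues", 0)))
            goto out;
        if ((rv = step("tagset", 0)))
            goto out_queues;
        if ((rv = step("disk", 1)))        /* simulate a late failure */
            goto out_tagset;
        return 0;

    out_tagset:
        undo("tagset");
    out_queues:
        undo("queues");
    out:
        return rv;
    }

    int main(void) { return add_dev() ? 1 : 0; }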
- if (g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); -} - -module_init(null_init); -module_exit(null_exit); - -MODULE_AUTHOR("Jens Axboe <axboe@xxxxxxxxx>"); -MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk_trace.c b/drivers/block/null_blk_trace.c deleted file mode 100644 index f246e7bff698..000000000000 --- a/drivers/block/null_blk_trace.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * null_blk trace related helpers. - * - * Copyright (C) 2020 Western Digital Corporation or its affiliates. - */ -#include "null_blk_trace.h" - -/* - * Helper to use for all null_blk traces to extract disk name. - */ -const char *nullb_trace_disk_name(struct trace_seq *p, char *name) -{ - const char *ret = trace_seq_buffer_ptr(p); - - if (name && *name) - trace_seq_printf(p, "disk=%s, ", name); - trace_seq_putc(p, 0); - - return ret; -} diff --git a/drivers/block/null_blk_trace.h b/drivers/block/null_blk_trace.h deleted file mode 100644 index 4f83032eb544..000000000000 --- a/drivers/block/null_blk_trace.h +++ /dev/null @@ -1,79 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * null_blk device driver tracepoints. - * - * Copyright (C) 2020 Western Digital Corporation or its affiliates. - */ - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM nullb - -#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_NULLB_H - -#include <linux/tracepoint.h> -#include <linux/trace_seq.h> - -#include "null_blk.h" - -const char *nullb_trace_disk_name(struct trace_seq *p, char *name); - -#define __print_disk_name(name) nullb_trace_disk_name(p, name) - -#ifndef TRACE_HEADER_MULTI_READ -static inline void __assign_disk_name(char *name, struct gendisk *disk) -{ - if (disk) - memcpy(name, disk->disk_name, DISK_NAME_LEN); - else - memset(name, 0, DISK_NAME_LEN); -} -#endif - -TRACE_EVENT(nullb_zone_op, - TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no, - unsigned int zone_cond), - TP_ARGS(cmd, zone_no, zone_cond), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(enum req_opf, op) - __field(unsigned int, zone_no) - __field(unsigned int, zone_cond) - ), - TP_fast_assign( - __entry->op = req_op(cmd->rq); - __entry->zone_no = zone_no; - __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->rq_disk); - ), - TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", - __print_disk_name(__entry->disk), - blk_op_str(__entry->op), - __entry->zone_no, - blk_zone_cond_str(__entry->zone_cond)) -); - -TRACE_EVENT(nullb_report_zones, - TP_PROTO(struct nullb *nullb, unsigned int nr_zones), - TP_ARGS(nullb, nr_zones), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(unsigned int, nr_zones) - ), - TP_fast_assign( - __entry->nr_zones = nr_zones; - __assign_disk_name(__entry->disk, nullb->disk); - ), - TP_printk("%s nr_zones=%u", - __print_disk_name(__entry->disk), __entry->nr_zones) -); - -#endif /* _TRACE_NULLB_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
-#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE null_blk_trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c deleted file mode 100644 index f5df82c26c16..000000000000 --- a/drivers/block/null_blk_zoned.c +++ /dev/null @@ -1,617 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/vmalloc.h> -#include <linux/bitmap.h> -#include "null_blk.h" - -#define CREATE_TRACE_POINTS -#include "null_blk_trace.h" - -#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) - -static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) -{ - return sect >> ilog2(dev->zone_size_sects); -} - -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) -{ - sector_t dev_capacity_sects, zone_capacity_sects; - sector_t sector = 0; - unsigned int i; - - if (!is_power_of_2(dev->zone_size)) { - pr_err("zone_size must be power-of-two\n"); - return -EINVAL; - } - if (dev->zone_size > dev->size) { - pr_err("Zone size larger than device capacity\n"); - return -EINVAL; - } - - if (!dev->zone_capacity) - dev->zone_capacity = dev->zone_size; - - if (dev->zone_capacity > dev->zone_size) { - pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", - dev->zone_capacity, dev->zone_size); - return -EINVAL; - } - - zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); - dev_capacity_sects = MB_TO_SECTS(dev->size); - dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); - dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); - if (dev_capacity_sects & (dev->zone_size_sects - 1)) - dev->nr_zones++; - - dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), - GFP_KERNEL | __GFP_ZERO); - if (!dev->zones) - return -ENOMEM; - - /* - * With memory backing, the zone_lock spinlock needs to be temporarily - * released to avoid scheduling in atomic context. To guarantee zone - * information protection, use a bitmap to lock zones with - * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing - * implies that the queue is marked with BLK_MQ_F_BLOCKING. 
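The zoned code above insists that zone_size be a power of two precisely so the zone lookup stays a shift: MB_TO_SECTS() converts the megabyte module parameters to sectors, and null_zone_no() is just sect >> ilog2(zone_size_sects). A runnable sketch of that geometry, again assuming 512-byte sectors; ilog2_u64 here is a portable stand-in for the kernel's ilog2():

    #include <stdio.h>

    #define SECTOR_SHIFT 9
    #define SZ_1M        (1ULL << 20)
    #define MB_TO_SECTS(mb) (((unsigned long long)(mb) * SZ_1M) >> SECTOR_SHIFT)

    static unsigned int ilog2_u64(unsigned long long v)
    {
        unsigned int r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    int main(void)
    {
        unsigned long long zone_size_sects = MB_TO_SECTS(256);  /* 256 MB zones */
        unsigned long long sect = MB_TO_SECTS(1000);            /* some LBA */

        printf("zone size = %llu sectors\n", zone_size_sects);
        printf("sector %llu -> zone %llu\n", sect,
               sect >> ilog2_u64(zone_size_sects));
        return 0;
    }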
- */ - spin_lock_init(&dev->zone_lock); - if (dev->memory_backed) { - dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); - if (!dev->zone_locks) { - kvfree(dev->zones); - return -ENOMEM; - } - } - - if (dev->zone_nr_conv >= dev->nr_zones) { - dev->zone_nr_conv = dev->nr_zones - 1; - pr_info("changed the number of conventional zones to %u", - dev->zone_nr_conv); - } - - /* Max active zones has to be < nbr of seq zones in order to be enforceable */ - if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { - dev->zone_max_active = 0; - pr_info("zone_max_active limit disabled, limit >= zone count\n"); - } - - /* Max open zones has to be <= max active zones */ - if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { - dev->zone_max_open = dev->zone_max_active; - pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); - } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { - dev->zone_max_open = 0; - pr_info("zone_max_open limit disabled, limit >= zone count\n"); - } - - for (i = 0; i < dev->zone_nr_conv; i++) { - struct blk_zone *zone = &dev->zones[i]; - - zone->start = sector; - zone->len = dev->zone_size_sects; - zone->capacity = zone->len; - zone->wp = zone->start + zone->len; - zone->type = BLK_ZONE_TYPE_CONVENTIONAL; - zone->cond = BLK_ZONE_COND_NOT_WP; - - sector += dev->zone_size_sects; - } - - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - struct blk_zone *zone = &dev->zones[i]; - - zone->start = zone->wp = sector; - if (zone->start + dev->zone_size_sects > dev_capacity_sects) - zone->len = dev_capacity_sects - zone->start; - else - zone->len = dev->zone_size_sects; - zone->capacity = - min_t(sector_t, zone->len, zone_capacity_sects); - zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; - zone->cond = BLK_ZONE_COND_EMPTY; - - sector += dev->zone_size_sects; - } - - q->limits.zoned = BLK_ZONED_HM; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - - return 0; -} - -int null_register_zoned_dev(struct nullb *nullb) -{ - struct nullb_device *dev = nullb->dev; - struct request_queue *q = nullb->q; - - if (queue_is_mq(q)) { - int ret = blk_revalidate_disk_zones(nullb->disk, NULL); - - if (ret) - return ret; - } else { - blk_queue_chunk_sectors(q, dev->zone_size_sects); - q->nr_zones = blkdev_nr_zones(nullb->disk); - } - - blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); - blk_queue_max_open_zones(q, dev->zone_max_open); - blk_queue_max_active_zones(q, dev->zone_max_active); - - return 0; -} - -void null_free_zoned_dev(struct nullb_device *dev) -{ - bitmap_free(dev->zone_locks); - kvfree(dev->zones); - dev->zones = NULL; -} - -static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) -{ - if (dev->memory_backed) - wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&dev->zone_lock); -} - -static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) -{ - spin_unlock_irq(&dev->zone_lock); - - if (dev->memory_backed) - clear_and_wake_up_bit(zno, dev->zone_locks); -} - -int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) -{ - struct nullb *nullb = disk->private_data; - struct nullb_device *dev = nullb->dev; - unsigned int first_zone, i, zno; - struct blk_zone zone; - int error; - - first_zone = null_zone_no(dev, sector); - if (first_zone >= dev->nr_zones) - return 0; - - nr_zones = min(nr_zones, dev->nr_zones - 
first_zone); - trace_nullb_report_zones(nullb, nr_zones); - - zno = first_zone; - for (i = 0; i < nr_zones; i++, zno++) { - /* - * Stacked DM target drivers will remap the zone information by - * modifying the zone information passed to the report callback. - * So use a local copy to avoid corruption of the device zone - * array. - */ - null_lock_zone(dev, zno); - memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); - null_unlock_zone(dev, zno); - - error = cb(&zone, i, data); - if (error) - return error; - } - - return nr_zones; -} - -/* - * This is called in the case of memory backing from null_process_cmd() - * with the target zone already locked. - */ -size_t null_zone_valid_read_len(struct nullb *nullb, - sector_t sector, unsigned int len) -{ - struct nullb_device *dev = nullb->dev; - struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; - unsigned int nr_sectors = len >> SECTOR_SHIFT; - - /* Read must be below the write pointer position */ - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || - sector + nr_sectors <= zone->wp) - return len; - - if (sector > zone->wp) - return 0; - - return (zone->wp - sector) << SECTOR_SHIFT; -} - -static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) -{ - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - switch (zone->cond) { - case BLK_ZONE_COND_CLOSED: - /* close operation on closed is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - if (zone->wp == zone->start) { - zone->cond = BLK_ZONE_COND_EMPTY; - } else { - zone->cond = BLK_ZONE_COND_CLOSED; - dev->nr_zones_closed++; - } - - return BLK_STS_OK; -} - -static void null_close_first_imp_zone(struct nullb_device *dev) -{ - unsigned int i; - - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { - null_close_zone(dev, &dev->zones[i]); - return; - } - } -} - -static blk_status_t null_check_active(struct nullb_device *dev) -{ - if (!dev->zone_max_active) - return BLK_STS_OK; - - if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + - dev->nr_zones_closed < dev->zone_max_active) - return BLK_STS_OK; - - return BLK_STS_ZONE_ACTIVE_RESOURCE; -} - -static blk_status_t null_check_open(struct nullb_device *dev) -{ - if (!dev->zone_max_open) - return BLK_STS_OK; - - if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) - return BLK_STS_OK; - - if (dev->nr_zones_imp_open) { - if (null_check_active(dev) == BLK_STS_OK) { - null_close_first_imp_zone(dev); - return BLK_STS_OK; - } - } - - return BLK_STS_ZONE_OPEN_RESOURCE; -} - -/* - * This function matches the manage open zone resources function in the ZBC standard, - * with the addition of max active zones support (added in the ZNS standard). - * - * The function determines if a zone can transition to implicit open or explicit open, - * while maintaining the max open zone (and max active zone) limit(s). It may close an - * implicit open zone in order to make additional zone resources available. - * - * ZBC states that an implicit open zone shall be closed only if there is not - * room within the open limit. However, with the addition of an active limit, - * it is not certain that closing an implicit open zone will allow a new zone - * to be opened, since we might already be at the active limit capacity. 
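The comment above spells out the accounting that null_check_active() and null_check_open() implement: a zone counts as active when implicitly open, explicitly open, or closed, and an open request may close an implicitly-open zone to free a slot, but only if the active limit still holds. A minimal model of the two checks, mirroring that logic; a zero limit means unlimited, and all names are illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    struct dev {
        unsigned int imp_open, exp_open, closed;
        unsigned int max_open, max_active;   /* 0 == no limit */
    };

    static bool can_activate(const struct dev *d)
    {
        if (!d->max_active)
            return true;
        return d->imp_open + d->exp_open + d->closed < d->max_active;
    }

    static bool can_open(const struct dev *d)
    {
        if (!d->max_open)
            return true;
        if (d->imp_open + d->exp_open < d->max_open)
            return true;
        /* an implicitly-open zone may be closed to free an open slot,
         * provided a newly activated zone stays within the active limit */
        return d->imp_open && can_activate(d);
    }

    int main(void)
    {
        struct dev d = { .imp_open = 2, .exp_open = 2, .closed = 0,
                         .max_open = 4, .max_active = 6 };

        printf("open allowed: %d\n", can_open(&d));  /* 1: close an imp-open */
        return 0;
    }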
- */ -static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) -{ - blk_status_t ret; - - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - ret = null_check_active(dev); - if (ret != BLK_STS_OK) - return ret; - fallthrough; - case BLK_ZONE_COND_CLOSED: - return null_check_open(dev); - default: - /* Should never be called for other states */ - WARN_ON(1); - return BLK_STS_IOERR; - } -} - -static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors, bool append) -{ - struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); - struct blk_zone *zone = &dev->zones[zno]; - blk_status_t ret; - - trace_nullb_zone_op(cmd, zno, zone->cond); - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { - if (append) - return BLK_STS_IOERR; - return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - } - - null_lock_zone(dev, zno); - - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* Cannot write to a full zone */ - ret = BLK_STS_IOERR; - goto unlock; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - break; - default: - /* Invalid zone condition */ - ret = BLK_STS_IOERR; - goto unlock; - } - - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. - */ - if (append) { - sector = zone->wp; - if (cmd->bio) - cmd->bio->bi_iter.bi_sector = sector; - else - cmd->rq->__sector = sector; - } else if (sector != zone->wp) { - ret = BLK_STS_IOERR; - goto unlock; - } - - if (zone->wp + nr_sectors > zone->start + zone->capacity) { - ret = BLK_STS_IOERR; - goto unlock; - } - - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; - } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - /* - * Memory backing allocation may sleep: release the zone_lock spinlock - * to avoid scheduling in atomic context. Zone operation atomicity is - * still guaranteed through the zone_locks bitmap. 
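null_zone_write() above enforces the sequential-write rules: a regular write must land exactly on the write pointer, a zone append is redirected to the write pointer, no write may cross the zone capacity, and reaching capacity moves the zone to FULL. A compact model of just the write-pointer bookkeeping; errors are flattened to -1 and zone state to a single flag:

    #include <stdbool.h>
    #include <stdio.h>

    struct zone {
        unsigned long long start, wp;
        unsigned long long capacity;   /* usable sectors from start */
        bool full;
    };

    static int zone_write(struct zone *z, unsigned long long sector,
                          unsigned long long nr, bool append)
    {
        if (z->full)
            return -1;
        if (append)
            sector = z->wp;           /* append always lands on the wp */
        else if (sector != z->wp)
            return -1;                /* unaligned write */
        if (z->wp + nr > z->start + z->capacity)
            return -1;                /* would overrun zone capacity */
        z->wp += nr;
        if (z->wp == z->start + z->capacity)
            z->full = true;
        return 0;
    }

    int main(void)
    {
        struct zone z = { .start = 0, .wp = 0, .capacity = 8 };

        printf("%d\n", zone_write(&z, 0, 4, false));  /*  0 */
        printf("%d\n", zone_write(&z, 0, 4, false));  /* -1: wp is now 4 */
        printf("%d\n", zone_write(&z, 0, 4, true));   /*  0: append at wp */
        printf("%d\n", zone_write(&z, 8, 1, false));  /* -1: zone is full */
        return 0;
    }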
- */ - if (dev->memory_backed) - spin_unlock_irq(&dev->zone_lock); - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (dev->memory_backed) - spin_lock_irq(&dev->zone_lock); - - if (ret != BLK_STS_OK) - goto unlock; - - zone->wp += nr_sectors; - if (zone->wp == zone->start + zone->capacity) { - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) - dev->nr_zones_exp_open--; - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) - dev->nr_zones_imp_open--; - zone->cond = BLK_ZONE_COND_FULL; - } - ret = BLK_STS_OK; - -unlock: - null_unlock_zone(dev, zno); - - return ret; -} - -static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) -{ - blk_status_t ret; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - switch (zone->cond) { - case BLK_ZONE_COND_EXP_OPEN: - /* open operation on exp open is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - return ret; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - return ret; - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - zone->cond = BLK_ZONE_COND_EXP_OPEN; - dev->nr_zones_exp_open++; - - return BLK_STS_OK; -} - -static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) -{ - blk_status_t ret; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* finish operation on full is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - return ret; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - return ret; - dev->nr_zones_closed--; - break; - default: - return BLK_STS_IOERR; - } - - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = zone->start + zone->len; - - return BLK_STS_OK; -} - -static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) -{ - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - /* reset operation on empty is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - break; - default: - return BLK_STS_IOERR; - } - - zone->cond = BLK_ZONE_COND_EMPTY; - zone->wp = zone->start; - - return BLK_STS_OK; -} - -static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, - sector_t sector) -{ - struct nullb_device *dev = cmd->nq->dev; - unsigned int zone_no; - struct blk_zone *zone; - blk_status_t ret; - size_t i; - - if (op == REQ_OP_ZONE_RESET_ALL) { - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - null_lock_zone(dev, i); - zone = &dev->zones[i]; - if (zone->cond != BLK_ZONE_COND_EMPTY) { - null_reset_zone(dev, zone); - trace_nullb_zone_op(cmd, i, zone->cond); - } - null_unlock_zone(dev, i); - } - return BLK_STS_OK; - } - - zone_no = null_zone_no(dev, sector); - zone = &dev->zones[zone_no]; - - null_lock_zone(dev, zone_no); 
- - switch (op) { - case REQ_OP_ZONE_RESET: - ret = null_reset_zone(dev, zone); - break; - case REQ_OP_ZONE_OPEN: - ret = null_open_zone(dev, zone); - break; - case REQ_OP_ZONE_CLOSE: - ret = null_close_zone(dev, zone); - break; - case REQ_OP_ZONE_FINISH: - ret = null_finish_zone(dev, zone); - break; - default: - ret = BLK_STS_NOTSUPP; - break; - } - - if (ret == BLK_STS_OK) - trace_nullb_zone_op(cmd, zone_no, zone->cond); - - null_unlock_zone(dev, zone_no); - - return ret; -} - -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, - sector_t sector, sector_t nr_sectors) -{ - struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); - blk_status_t sts; - - switch (op) { - case REQ_OP_WRITE: - sts = null_zone_write(cmd, sector, nr_sectors, false); - break; - case REQ_OP_ZONE_APPEND: - sts = null_zone_write(cmd, sector, nr_sectors, true); - break; - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_RESET_ALL: - case REQ_OP_ZONE_OPEN: - case REQ_OP_ZONE_CLOSE: - case REQ_OP_ZONE_FINISH: - sts = null_zone_mgmt(cmd, op, sector); - break; - default: - null_lock_zone(dev, zno); - sts = null_process_cmd(cmd, op, sector, nr_sectors); - null_unlock_zone(dev, zno); - } - - return sts; -} diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 39aeebc6837d..d9e41d3bbe71 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -984,6 +984,8 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) print_version(); hp = mdesc_grab(); + if (!hp) + return -ENODEV; err = -ENODEV; if ((vdev->dev_no << PARTITION_SHIFT) & ~(u64)MINORMASK) { diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index c715d4681a0b..4ae49eae4586 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -79,7 +79,7 @@ config COMMON_CLK_RK808 config COMMON_CLK_HI655X tristate "Clock driver for Hi655x" if EXPERT depends on (MFD_HI655X_PMIC || COMPILE_TEST) - depends on REGMAP + select REGMAP default MFD_HI655X_PMIC help This driver supports the hi655x PMIC clock. 
This diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index 4a031c62f92a..5098639d41f1 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -182,7 +182,8 @@ static void psci_pd_remove(void) struct psci_pd_provider *pd_provider, *it; struct generic_pm_domain *genpd; - list_for_each_entry_safe(pd_provider, it, &psci_pd_providers, link) { + list_for_each_entry_safe_reverse(pd_provider, it, + &psci_pd_providers, link) { of_genpd_del_provider(pd_provider->node); genpd = of_genpd_remove_last(pd_provider->node); diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 9e6504592646..300ba2991936 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -171,7 +171,7 @@ static int zynqmp_pm_feature(u32 api_id) } /* Add new entry if not present */ - feature_data = kmalloc(sizeof(*feature_data), GFP_KERNEL); + feature_data = kmalloc(sizeof(*feature_data), GFP_ATOMIC); if (!feature_data) return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 159be13ef20b..2c19b3775179 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -528,16 +528,13 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) struct kfd_event_waiter *event_waiters; uint32_t i; - event_waiters = kmalloc_array(num_events, - sizeof(struct kfd_event_waiter), - GFP_KERNEL); + event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter), + GFP_KERNEL); if (!event_waiters) return NULL; - for (i = 0; (event_waiters) && (i < num_events) ; i++) { + for (i = 0; i < num_events; i++) init_wait(&event_waiters[i].wait); - event_waiters[i].activated = false; - } return event_waiters; } diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c index e427f4ffa080..e5b1002d7f3f 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn30/display_mode_vba_30.c @@ -1868,7 +1868,10 @@ static unsigned int CalculateVMAndRowBytes( } if (SurfaceTiling == dm_sw_linear) { - *dpte_row_height = dml_min(128, 1 << (unsigned int) dml_floor(dml_log2(PTEBufferSizeInRequests * *PixelPTEReqWidth / Pitch), 1)); + if (PTEBufferSizeInRequests == 0) + *dpte_row_height = 1; + else + *dpte_row_height = dml_min(128, 1 << (unsigned int) dml_floor(dml_log2(PTEBufferSizeInRequests * *PixelPTEReqWidth / Pitch), 1)); *dpte_row_width_ub = (dml_ceil(((double) SwathWidth - 1) / *PixelPTEReqWidth, 1) + 1) * *PixelPTEReqWidth; *PixelPTEBytesPerRow = *dpte_row_width_ub / *PixelPTEReqWidth * *PTERequestSize; } else if (ScanDirection != dm_vert) { diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index c56656a95cf9..b7bb5610dfe2 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -614,11 +614,14 @@ int drm_gem_shmem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) int ret; if (obj->import_attach) { - /* Drop the reference drm_gem_mmap_obj() acquired.*/ - drm_gem_object_put(obj); vma->vm_private_data = NULL; + ret = dma_buf_mmap(obj->dma_buf, vma, 0); + + /* Drop the reference drm_gem_mmap_obj() acquired.*/ + if (!ret) + drm_gem_object_put(obj); - return dma_buf_mmap(obj->dma_buf, vma, 0); + return ret; } shmem = to_drm_gem_shmem_obj(obj); diff --git 
a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c index 69b2e5509d67..de67b2745258 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.c +++ b/drivers/gpu/drm/i915/gt/intel_ring.c @@ -108,7 +108,7 @@ static struct i915_vma *create_ring_vma(struct i915_ggtt *ggtt, int size) struct i915_vma *vma; obj = ERR_PTR(-ENODEV); - if (i915_ggtt_has_aperture(ggtt)) + if (i915_ggtt_has_aperture(ggtt) && !HAS_LLC(i915)) obj = i915_gem_object_create_stolen(i915, size); if (IS_ERR(obj)) obj = i915_gem_object_create_internal(i915, size); diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index c4c2d24dc509..0532a5069c04 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -432,8 +432,7 @@ replace_barrier(struct i915_active *ref, struct i915_active_fence *active) * we can use it to substitute for the pending idle-barrer * request that we want to emit on the kernel_context. */ - __active_del_barrier(ref, node_from_active(active)); - return true; + return __active_del_barrier(ref, node_from_active(active)); } int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence) @@ -446,16 +445,19 @@ int i915_active_ref(struct i915_active *ref, u64 idx, struct dma_fence *fence) if (err) return err; - active = active_instance(ref, idx); - if (!active) { - err = -ENOMEM; - goto out; - } + do { + active = active_instance(ref, idx); + if (!active) { + err = -ENOMEM; + goto out; + } + + if (replace_barrier(ref, active)) { + RCU_INIT_POINTER(active->fence, NULL); + atomic_dec(&ref->count); + } + } while (unlikely(is_barrier(active))); - if (replace_barrier(ref, active)) { - RCU_INIT_POINTER(active->fence, NULL); - atomic_dec(&ref->count); - } if (!__i915_active_fence_set(active, fence)) __i915_active_acquire(ref); diff --git a/drivers/gpu/drm/meson/meson_vpp.c b/drivers/gpu/drm/meson/meson_vpp.c index 154837688ab0..5df1957c8e41 100644 --- a/drivers/gpu/drm/meson/meson_vpp.c +++ b/drivers/gpu/drm/meson/meson_vpp.c @@ -100,6 +100,8 @@ void meson_vpp_init(struct meson_drm *priv) priv->io_base + _REG(VPP_DOLBY_CTRL)); writel_relaxed(0x1020080, priv->io_base + _REG(VPP_DUMMY_DATA1)); + writel_relaxed(0x42020, + priv->io_base + _REG(VPP_DUMMY_DATA)); } else if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_G12A)) writel_relaxed(0xf, priv->io_base + _REG(DOLBY_PATH_CTRL)); diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 13596961ae17..5ff856ef7d88 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -236,7 +236,7 @@ static void panfrost_mmu_flush_range(struct panfrost_device *pfdev, if (pm_runtime_active(pfdev->dev)) mmu_hw_do_operation(pfdev, mmu, iova, size, AS_COMMAND_FLUSH_PT); - pm_runtime_put_sync_autosuspend(pfdev->dev); + pm_runtime_put_autosuspend(pfdev->dev); } static int mmu_map_sg(struct panfrost_device *pfdev, struct panfrost_mmu *mmu, diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 5f9ec1d1464a..524d6d712e72 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -258,6 +258,7 @@ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsign { struct hid_report *report; struct hid_field *field; + unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; unsigned int usages; unsigned int offset; unsigned int i; @@ -288,8 +289,11 @@ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsign offset = report->size; report->size += 
parser->global.report_size * parser->global.report_count; + if (parser->device->ll_driver->max_buffer_size) + max_buffer_size = parser->device->ll_driver->max_buffer_size; + /* Total size check: Allow for possible report index byte */ - if (report->size > (HID_MAX_BUFFER_SIZE - 1) << 3) { + if (report->size > (max_buffer_size - 1) << 3) { hid_err(parser->device, "report is too long\n"); return -1; } @@ -1752,6 +1756,7 @@ int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; + int max_buffer_size = HID_MAX_BUFFER_SIZE; unsigned int a; u32 rsize, csize = size; u8 *cdata = data; @@ -1768,10 +1773,13 @@ int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size, rsize = hid_compute_report_size(report); - if (report_enum->numbered && rsize >= HID_MAX_BUFFER_SIZE) - rsize = HID_MAX_BUFFER_SIZE - 1; - else if (rsize > HID_MAX_BUFFER_SIZE) - rsize = HID_MAX_BUFFER_SIZE; + if (hid->ll_driver->max_buffer_size) + max_buffer_size = hid->ll_driver->max_buffer_size; + + if (report_enum->numbered && rsize >= max_buffer_size) + rsize = max_buffer_size - 1; + else if (rsize > max_buffer_size) + rsize = max_buffer_size; if (csize < rsize) { dbg_hid("report %d is too short, (%d < %d)\n", report->id, diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index fc06d8bb42e0..ba0ca652b9da 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -395,6 +395,7 @@ struct hid_ll_driver uhid_hid_driver = { .parse = uhid_hid_parse, .raw_request = uhid_hid_raw_request, .output_report = uhid_hid_output_report, + .max_buffer_size = UHID_DATA_MAX, }; EXPORT_SYMBOL_GPL(uhid_hid_driver); diff --git a/drivers/hwmon/adt7475.c b/drivers/hwmon/adt7475.c index 9d5b019651f2..6b84822e7d93 100644 --- a/drivers/hwmon/adt7475.c +++ b/drivers/hwmon/adt7475.c @@ -486,10 +486,10 @@ static ssize_t temp_store(struct device *dev, struct device_attribute *attr, val = (temp - val) / 1000; if (sattr->index != 1) { - data->temp[HYSTERSIS][sattr->index] &= 0xF0; + data->temp[HYSTERSIS][sattr->index] &= 0x0F; data->temp[HYSTERSIS][sattr->index] |= (val & 0xF) << 4; } else { - data->temp[HYSTERSIS][sattr->index] &= 0x0F; + data->temp[HYSTERSIS][sattr->index] &= 0xF0; data->temp[HYSTERSIS][sattr->index] |= (val & 0xF); } @@ -554,11 +554,11 @@ static ssize_t temp_st_show(struct device *dev, struct device_attribute *attr, val = data->enh_acoustics[0] & 0xf; break; case 1: - val = (data->enh_acoustics[1] >> 4) & 0xf; + val = data->enh_acoustics[1] & 0xf; break; case 2: default: - val = data->enh_acoustics[1] & 0xf; + val = (data->enh_acoustics[1] >> 4) & 0xf; break; } diff --git a/drivers/hwmon/ina3221.c b/drivers/hwmon/ina3221.c index d3c98115042b..836e7579e166 100644 --- a/drivers/hwmon/ina3221.c +++ b/drivers/hwmon/ina3221.c @@ -772,7 +772,7 @@ static int ina3221_probe_child_from_dt(struct device *dev, return ret; } else if (val > INA3221_CHANNEL3) { dev_err(dev, "invalid reg %d of %pOFn\n", val, child); - return ret; + return -EINVAL; } input = &ina->inputs[val]; diff --git a/drivers/hwmon/pmbus/adm1266.c b/drivers/hwmon/pmbus/adm1266.c index c7b373ba92f2..d1b2e936546f 100644 --- a/drivers/hwmon/pmbus/adm1266.c +++ b/drivers/hwmon/pmbus/adm1266.c @@ -301,6 +301,7 @@ static int adm1266_config_gpio(struct adm1266_data *data) data->gc.label = name; data->gc.parent = &data->client->dev; data->gc.owner = THIS_MODULE; + data->gc.can_sleep = true; data->gc.base = -1; data->gc.names = 
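The adt7475 hunks above are a classic nibble-masking inversion: to update the high nibble of a register you must keep the low one (&= 0x0F) and vice versa, and the old code had the two masks swapped. A small demonstration of the corrected masking; the register value is arbitrary:

    #include <stdio.h>

    static unsigned char set_high_nibble(unsigned char reg, unsigned char val)
    {
        reg &= 0x0F;                  /* keep low nibble */
        reg |= (val & 0xF) << 4;      /* install new high nibble */
        return reg;
    }

    static unsigned char set_low_nibble(unsigned char reg, unsigned char val)
    {
        reg &= 0xF0;                  /* keep high nibble */
        reg |= val & 0xF;             /* install new low nibble */
        return reg;
    }

    int main(void)
    {
        unsigned char reg = 0xA5;

        printf("0x%02X\n", set_high_nibble(reg, 0x3)); /* 0x35 */
        printf("0x%02X\n", set_low_nibble(reg, 0x3));  /* 0xA3 */
        return 0;
    }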
data->gpio_names; data->gc.ngpio = ARRAY_SIZE(data->gpio_names); diff --git a/drivers/hwmon/pmbus/ucd9000.c b/drivers/hwmon/pmbus/ucd9000.c index f8017993e2b4..9e26cc084a17 100644 --- a/drivers/hwmon/pmbus/ucd9000.c +++ b/drivers/hwmon/pmbus/ucd9000.c @@ -7,6 +7,7 @@ */ #include <linux/debugfs.h> +#include <linux/delay.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/of_device.h> @@ -16,6 +17,7 @@ #include <linux/i2c.h> #include <linux/pmbus.h> #include <linux/gpio/driver.h> +#include <linux/timekeeping.h> #include "pmbus.h" enum chips { ucd9000, ucd90120, ucd90124, ucd90160, ucd90320, ucd9090, @@ -65,6 +67,7 @@ struct ucd9000_data { struct gpio_chip gpio; #endif struct dentry *debugfs; + ktime_t write_time; }; #define to_ucd9000_data(_info) container_of(_info, struct ucd9000_data, info) @@ -73,6 +76,73 @@ struct ucd9000_debugfs_entry { u8 index; }; +/* + * It has been observed that the UCD90320 randomly fails register access when + * doing another access right on the back of a register write. To mitigate this + * make sure that there is a minimum delay between a write access and the + * following access. The 250us is based on experimental data. At a delay of + * 200us the issue seems to go away. Add a bit of extra margin to allow for + * system to system differences. + */ +#define UCD90320_WAIT_DELAY_US 250 + +static inline void ucd90320_wait(const struct ucd9000_data *data) +{ + s64 delta = ktime_us_delta(ktime_get(), data->write_time); + + if (delta < UCD90320_WAIT_DELAY_US) + udelay(UCD90320_WAIT_DELAY_US - delta); +} + +static int ucd90320_read_word_data(struct i2c_client *client, int page, + int phase, int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct ucd9000_data *data = to_ucd9000_data(info); + + if (reg >= PMBUS_VIRT_BASE) + return -ENXIO; + + ucd90320_wait(data); + return pmbus_read_word_data(client, page, phase, reg); +} + +static int ucd90320_read_byte_data(struct i2c_client *client, int page, int reg) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct ucd9000_data *data = to_ucd9000_data(info); + + ucd90320_wait(data); + return pmbus_read_byte_data(client, page, reg); +} + +static int ucd90320_write_word_data(struct i2c_client *client, int page, + int reg, u16 word) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct ucd9000_data *data = to_ucd9000_data(info); + int ret; + + ucd90320_wait(data); + ret = pmbus_write_word_data(client, page, reg, word); + data->write_time = ktime_get(); + + return ret; +} + +static int ucd90320_write_byte(struct i2c_client *client, int page, u8 value) +{ + const struct pmbus_driver_info *info = pmbus_get_driver_info(client); + struct ucd9000_data *data = to_ucd9000_data(info); + int ret; + + ucd90320_wait(data); + ret = pmbus_write_byte(client, page, value); + data->write_time = ktime_get(); + + return ret; +} + static int ucd9000_get_fan_config(struct i2c_client *client, int fan) { int fan_config = 0; @@ -598,6 +668,11 @@ static int ucd9000_probe(struct i2c_client *client) info->read_byte_data = ucd9000_read_byte_data; info->func[0] |= PMBUS_HAVE_FAN12 | PMBUS_HAVE_STATUS_FAN12 | PMBUS_HAVE_FAN34 | PMBUS_HAVE_STATUS_FAN34; + } else if (mid->driver_data == ucd90320) { + info->read_byte_data = ucd90320_read_byte_data; + info->read_word_data = ucd90320_read_word_data; + info->write_byte = ucd90320_write_byte; + info->write_word_data = ucd90320_write_word_data; } ucd9000_probe_gpio(client, mid, data); diff --git 
a/drivers/hwmon/tmp513.c b/drivers/hwmon/tmp513.c index 47bbe47e062f..7d5f7441aceb 100644 --- a/drivers/hwmon/tmp513.c +++ b/drivers/hwmon/tmp513.c @@ -758,7 +758,7 @@ static int tmp51x_probe(struct i2c_client *client) static struct i2c_driver tmp51x_driver = { .driver = { .name = "tmp51x", - .of_match_table = of_match_ptr(tmp51x_of_match), + .of_match_table = tmp51x_of_match, }, .probe_new = tmp51x_probe, .id_table = tmp51x_id, diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index f2a5af239c95..f5d3cf86753f 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -768,6 +768,7 @@ static int xgene_hwmon_remove(struct platform_device *pdev) { struct xgene_hwmon_dev *ctx = platform_get_drvdata(pdev); + cancel_work_sync(&ctx->workq); hwmon_device_unregister(ctx->hwmon_dev); kfifo_free(&ctx->async_msg_fifo); if (acpi_disabled) diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c index ceb6cdc20484..7db6d0fc6ec2 100644 --- a/drivers/interconnect/core.c +++ b/drivers/interconnect/core.c @@ -850,6 +850,10 @@ void icc_node_destroy(int id) mutex_unlock(&icc_lock); + if (!node) + return; + + kfree(node->links); kfree(node); } EXPORT_SYMBOL_GPL(icc_node_destroy); diff --git a/drivers/media/i2c/m5mols/m5mols_core.c b/drivers/media/i2c/m5mols/m5mols_core.c index 21666d705e37..dcf9e4d4ee6b 100644 --- a/drivers/media/i2c/m5mols/m5mols_core.c +++ b/drivers/media/i2c/m5mols/m5mols_core.c @@ -488,7 +488,7 @@ static enum m5mols_restype __find_restype(u32 code) do { if (code == m5mols_default_ffmt[type].code) return type; - } while (type++ != SIZE_DEFAULT_FFMT); + } while (++type != SIZE_DEFAULT_FFMT); return 0; } diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index af85b32c6c1c..c468f9a02ef6 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -1818,7 +1818,6 @@ static void atmci_tasklet_func(unsigned long priv) atmci_writel(host, ATMCI_IER, ATMCI_NOTBUSY); state = STATE_WAITING_NOTBUSY; } else if (host->mrq->stop) { - atmci_writel(host, ATMCI_IER, ATMCI_CMDRDY); atmci_send_stop_cmd(host, data); state = STATE_SENDING_STOP; } else { @@ -1851,8 +1850,6 @@ static void atmci_tasklet_func(unsigned long priv) * command to send. 
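The m5mols hunk above fixes a do/while bound: with type++ != SIZE_DEFAULT_FFMT the loop compares the old value, so the body inspects one element past the table before giving up, while ++type stops at the last valid index. A tiny demonstration of the difference; the probe counter stands in for indexing m5mols_default_ffmt:

    #include <stdio.h>

    #define N 3   /* table has N valid entries, indices 0..N-1 */

    static int count_probes(int pre_increment)
    {
        int probes = 0;
        int type = 0;

        do {
            probes++;                       /* table[type] inspected here */
        } while (pre_increment ? (++type != N) : (type++ != N));

        return probes;
    }

    int main(void)
    {
        printf("type++ probes %d entries (one out of bounds)\n", count_probes(0));
        printf("++type probes %d entries\n", count_probes(1));
        return 0;
    }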
*/ if (host->mrq->stop) { - atmci_writel(host, ATMCI_IER, - ATMCI_CMDRDY); atmci_send_stop_cmd(host, data); state = STATE_SENDING_STOP; } else { diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 24cd6d3dc647..bf2592774165 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -369,7 +369,7 @@ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) MAX_POWER_ON_TIMEOUT, false, host, val, reg); if (ret) - dev_warn(mmc_dev(host->mmc), "Power on failed\n"); + dev_info(mmc_dev(host->mmc), "Power on failed\n"); } } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 371b345635e6..a253476a52b0 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2734,7 +2734,7 @@ static int mv88e6xxx_get_max_mtu(struct dsa_switch *ds, int port) return 10240 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; else if (chip->info->ops->set_max_frame_size) return 1632 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; - return 1522 - VLAN_ETH_HLEN - EDSA_HLEN - ETH_FCS_LEN; + return ETH_DATA_LEN; } static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) @@ -2742,6 +2742,17 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) struct mv88e6xxx_chip *chip = ds->priv; int ret = 0; + /* For families where we don't know how to alter the MTU, + * just accept any value up to ETH_DATA_LEN + */ + if (!chip->info->ops->port_set_jumbo_size && + !chip->info->ops->set_max_frame_size) { + if (new_mtu > ETH_DATA_LEN) + return -EINVAL; + + return 0; + } + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) new_mtu += EDSA_HLEN; @@ -2750,9 +2761,6 @@ static int mv88e6xxx_change_mtu(struct dsa_switch *ds, int port, int new_mtu) ret = chip->info->ops->port_set_jumbo_size(chip, port, new_mtu); else if (chip->info->ops->set_max_frame_size) ret = chip->info->ops->set_max_frame_size(chip, new_mtu); - else - if (new_mtu > 1522) - ret = -EINVAL; mv88e6xxx_reg_unlock(chip); return ret; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 9e8a20a94862..76481ff7074b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -14851,6 +14851,7 @@ static int i40e_init_recovery_mode(struct i40e_pf *pf, struct i40e_hw *hw) int err; int v_idx; + pci_set_drvdata(pf->pdev, pf); pci_save_state(pf->pdev); /* set up periodic task facility */ diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 59963b901be0..e0790df700e2 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -169,8 +169,6 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) } netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); - ice_qvec_dis_irq(vsi, rx_ring, q_vector); - ice_fill_txq_meta(vsi, tx_ring, &txq_meta); err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta); if (err) @@ -185,6 +183,8 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx) if (err) return err; } + ice_qvec_dis_irq(vsi, rx_ring, q_vector); + err = ice_vsi_ctrl_one_rx_ring(vsi, false, q_idx, true); if (err) return err; diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index d2f5855b2ea7..895b6f0a3984 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -4986,6 +4986,11 @@ static int 
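The mv88e6xxx change above turns the "chip cannot reprogram its MTU" case into an early exit: such switches now accept any MTU up to the standard ETH_DATA_LEN as a no-op instead of failing, and get_max_mtu reports ETH_DATA_LEN to match. A hedged sketch of that control flow with a simplified ops table; chip_ops and the callback names are stand-ins, not the driver's types:

```c
#include <errno.h>

#define ETH_DATA_LEN 1500	/* classic Ethernet payload size */

struct chip_ops {
	int (*set_jumbo)(int port, int mtu);	/* may be NULL */
	int (*set_max_frame)(int mtu);		/* may be NULL */
};

static int change_mtu(const struct chip_ops *ops, int port, int new_mtu)
{
	/* Chip cannot be reprogrammed: treat anything up to the default
	 * as an accepted no-op and reject the rest up front. */
	if (!ops->set_jumbo && !ops->set_max_frame)
		return new_mtu > ETH_DATA_LEN ? -EINVAL : 0;

	if (ops->set_jumbo)
		return ops->set_jumbo(port, new_mtu);
	return ops->set_max_frame(new_mtu);
}
```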
qed_init_wfq_param(struct qed_hwfn *p_hwfn, num_vports = p_hwfn->qm_info.num_vports; + if (num_vports < 2) { + DP_NOTICE(p_hwfn, "Unexpected num_vports: %d\n", num_vports); + return -EINVAL; + } + /* Accounting for the vports which are configured for WFQ explicitly */ for (i = 0; i < num_vports; i++) { u32 tmp_speed; diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c index 3e3192a3ad9b..fdbd5f07a185 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c @@ -422,7 +422,7 @@ qed_mfw_get_tlv_time_value(struct qed_mfw_tlv_time *p_time, if (p_time->hour > 23) p_time->hour = 0; if (p_time->min > 59) - p_time->hour = 0; + p_time->min = 0; if (p_time->msec > 999) p_time->msec = 0; if (p_time->usec > 999) diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c index 01ea0d6f8819..934a4b54784b 100644 --- a/drivers/net/ethernet/sun/ldmvsw.c +++ b/drivers/net/ethernet/sun/ldmvsw.c @@ -290,6 +290,9 @@ static int vsw_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) hp = mdesc_grab(); + if (!hp) + return -ENODEV; + rmac = mdesc_get_property(hp, vdev->mp, remote_macaddr_prop, &len); err = -ENODEV; if (!rmac) { diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index 96b883f965f6..b6c03adf1e76 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -431,6 +431,9 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) hp = mdesc_grab(); + if (!hp) + return -ENODEV; + vp = vnet_find_parent(hp, vdev->mp, vdev); if (IS_ERR(vp)) { pr_err("Cannot find port parent vnet\n"); diff --git a/drivers/net/ipvlan/ipvlan_l3s.c b/drivers/net/ipvlan/ipvlan_l3s.c index 943d26cbf39f..71712ea25403 100644 --- a/drivers/net/ipvlan/ipvlan_l3s.c +++ b/drivers/net/ipvlan/ipvlan_l3s.c @@ -101,6 +101,7 @@ static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, goto out; skb->dev = addr->master->dev; + skb->skb_iif = skb->dev->ifindex; len = skb->len + ETH_HLEN; ipvlan_count_rx(addr->master, len, true, false); out: diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c index caf7291ffaf8..b67de3f9ef18 100644 --- a/drivers/net/phy/smsc.c +++ b/drivers/net/phy/smsc.c @@ -181,8 +181,11 @@ static int lan95xx_config_aneg_ext(struct phy_device *phydev) static int lan87xx_read_status(struct phy_device *phydev) { struct smsc_phy_priv *priv = phydev->priv; + int err; - int err = genphy_read_status(phydev); + err = genphy_read_status(phydev); + if (err) + return err; if (!phydev->link && priv->energy_enable) { /* Disable EDPD to wake up PHY */ diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 378a12ae2d95..fb1389bd0939 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -2199,6 +2199,13 @@ static int smsc75xx_rx_fixup(struct usbnet *dev, struct sk_buff *skb) size = (rx_cmd_a & RX_CMD_A_LEN) - RXW_PADDING; align_count = (4 - ((size + RXW_PADDING) % 4)) % 4; + if (unlikely(size > skb->len)) { + netif_dbg(dev, rx_err, dev->net, + "size err rx_cmd_a=0x%08x\n", + rx_cmd_a); + return 0; + } + if (unlikely(rx_cmd_a & RX_CMD_A_RED)) { netif_dbg(dev, rx_err, dev->net, "Error rx_cmd_a=0x%08x\n", rx_cmd_a); diff --git a/drivers/nfc/pn533/usb.c b/drivers/nfc/pn533/usb.c index 57b07446bb76..68eb1253f888 100644 --- a/drivers/nfc/pn533/usb.c +++ b/drivers/nfc/pn533/usb.c @@ -175,6 +175,7 @@ static int pn533_usb_send_frame(struct pn533 *dev, 
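The smsc75xx fix above illustrates a general rule for RX fixup paths: a length field parsed out of a device descriptor is untrusted and must be checked against the bytes actually held before anything copies from the buffer. A toy version of the check; rx_frame_ok is a hypothetical helper, not the driver's:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Refuse a device-reported payload length that points past the end of
 * the data we actually received, instead of overreading the buffer. */
static bool rx_frame_ok(uint32_t rx_cmd_a, uint32_t reported_size,
			size_t buf_len)
{
	if (reported_size > buf_len) {
		fprintf(stderr, "size err rx_cmd_a=0x%08x\n", rx_cmd_a);
		return false;	/* drop the frame */
	}
	return true;
}
```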
print_hex_dump_debug("PN533 TX: ", DUMP_PREFIX_NONE, 16, 1, out->data, out->len, false); + arg.phy = phy; init_completion(&arg.done); cntx = phy->out_urb->context; phy->out_urb->context = &arg; diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c index 5d74c674368a..8ccf5a86ad1b 100644 --- a/drivers/nfc/st-nci/ndlc.c +++ b/drivers/nfc/st-nci/ndlc.c @@ -286,13 +286,15 @@ EXPORT_SYMBOL(ndlc_probe); void ndlc_remove(struct llt_ndlc *ndlc) { - st_nci_remove(ndlc->ndev); - /* cancel timers */ del_timer_sync(&ndlc->t1_timer); del_timer_sync(&ndlc->t2_timer); ndlc->t2_active = false; ndlc->t1_active = false; + /* cancel work */ + cancel_work_sync(&ndlc->sm_work); + + st_nci_remove(ndlc->ndev); skb_queue_purge(&ndlc->rcv_q); skb_queue_purge(&ndlc->send_q); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e162f1dfbafe..a4b6aa932a8f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -723,16 +723,26 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, range = page_address(ns->ctrl->discard_page); } - __rq_for_each_bio(bio, req) { - u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); - u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; - - if (n < segments) { - range[n].cattr = cpu_to_le32(0); - range[n].nlb = cpu_to_le32(nlb); - range[n].slba = cpu_to_le64(slba); + if (queue_max_discard_segments(req->q) == 1) { + u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req)); + u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9); + + range[0].cattr = cpu_to_le32(0); + range[0].nlb = cpu_to_le32(nlb); + range[0].slba = cpu_to_le64(slba); + n = 1; + } else { + __rq_for_each_bio(bio, req) { + u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); + u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; + + if (n < segments) { + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + } + n++; } - n++; } if (WARN_ON_ONCE(n != segments)) { diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index bc88ff2912f5..a82a0796a614 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -749,8 +749,10 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) void nvmet_req_complete(struct nvmet_req *req, u16 status) { + struct nvmet_sq *sq = req->sq; + __nvmet_req_complete(req, status); - percpu_ref_put(&req->sq->ref); + percpu_ref_put(&sq->ref); } EXPORT_SYMBOL_GPL(nvmet_req_complete); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 8b587fc97f7b..c22cc20db1a7 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -911,7 +911,7 @@ static int pci_pm_resume_noirq(struct device *dev) pcie_pme_root_status_cleanup(pci_dev); if (!skip_bus_pm && prev_state == PCI_D3cold) - pci_bridge_wait_for_secondary_bus(pci_dev); + pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT); if (pci_has_legacy_pm_support(pci_dev)) return 0; @@ -1298,7 +1298,7 @@ static int pci_pm_runtime_resume(struct device *dev) pci_pm_default_resume(pci_dev); if (prev_state == PCI_D3cold) - pci_bridge_wait_for_secondary_bus(pci_dev); + pci_bridge_wait_for_secondary_bus(pci_dev, "resume", PCI_RESET_WAIT); if (pm && pm->runtime_resume) error = pm->runtime_resume(dev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 744a2e05635b..d37013d007b6 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -157,9 +157,6 @@ static int __init pcie_port_pm_setup(char *str) } __setup("pcie_port_pm=", pcie_port_pm_setup); -/* Time to 
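The one-line nvmet change above is the classic "save what you need before handing off ownership" pattern: completing a request may free it (the completion can run from another context), so req->sq has to be loaded into a local before the call. A compilable toy of the same shape, with free() standing in for the completion side effect; all names here are hypothetical:

```c
#include <stdlib.h>

struct queue { int refcount; };
struct request { struct queue *q; };

static void queue_put_ref(struct queue *q) { q->refcount--; }

/* Completing a request may free it, so anything still needed from *req
 * must be read first. */
static void req_complete(struct request *req)
{
	struct queue *q = req->q;	/* load before ownership is dropped */

	free(req);			/* models __nvmet_req_complete() freeing req */
	queue_put_ref(q);		/* safe: only the saved pointer is used */
}
```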
wait after a reset for device to become responsive */ -#define PCIE_RESET_READY_POLL_MS 60000 - /** * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children * @bus: pointer to PCI bus structure to search @@ -1221,7 +1218,7 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) return -ENOTTY; } - if (delay > 1000) + if (delay > PCI_RESET_WAIT) pci_info(dev, "not ready %dms after %s; waiting\n", delay - 1, reset_type); @@ -1230,7 +1227,7 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) pci_read_config_dword(dev, PCI_COMMAND, &id); } - if (delay > 1000) + if (delay > PCI_RESET_WAIT) pci_info(dev, "ready %dms after %s\n", delay - 1, reset_type); @@ -4792,24 +4789,31 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus) /** * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible * @dev: PCI bridge + * @reset_type: reset type in human-readable form + * @timeout: maximum time to wait for devices on secondary bus (milliseconds) * * Handle necessary delays before access to the devices on the secondary - * side of the bridge are permitted after D3cold to D0 transition. + * side of the bridge are permitted after D3cold to D0 transition + * or Conventional Reset. * * For PCIe this means the delays in PCIe 5.0 section 6.6.1. For * conventional PCI it means Tpvrh + Trhfa specified in PCI 3.0 section * 4.3.2. + * + * Return 0 on success or -ENOTTY if the first device on the secondary bus + * failed to become accessible. */ -void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type, + int timeout) { struct pci_dev *child; int delay; if (pci_dev_is_disconnected(dev)) - return; + return 0; if (!pci_is_bridge(dev)) - return; + return 0; down_read(&pci_bus_sem); @@ -4821,14 +4825,14 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) */ if (!dev->subordinate || list_empty(&dev->subordinate->devices)) { up_read(&pci_bus_sem); - return; + return 0; } /* Take d3cold_delay requirements into account */ delay = pci_bus_max_d3cold_delay(dev->subordinate); if (!delay) { up_read(&pci_bus_sem); - return; + return 0; } child = list_first_entry(&dev->subordinate->devices, struct pci_dev, @@ -4837,14 +4841,12 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) /* * Conventional PCI and PCI-X we need to wait Tpvrh + Trhfa before - * accessing the device after reset (that is 1000 ms + 100 ms). In - * practice this should not be needed because we don't do power - * management for them (see pci_bridge_d3_possible()). + * accessing the device after reset (that is 1000 ms + 100 ms). */ if (!pci_is_pcie(dev)) { pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay); msleep(1000 + delay); - return; + return 0; } /* @@ -4861,11 +4863,11 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) * configuration requests if we only wait for 100 ms (see * https://bugzilla.kernel.org/show_bug.cgi?id=203885). * - * Therefore we wait for 100 ms and check for the device presence. - * If it is still not present give it an additional 100 ms. + * Therefore we wait for 100 ms and check for the device presence + * until the timeout expires. 
*/ if (!pcie_downstream_port(dev)) - return; + return 0; if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) { pci_dbg(dev, "waiting %d ms for downstream link\n", delay); @@ -4876,14 +4878,11 @@ void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev) if (!pcie_wait_for_link_delay(dev, true, delay)) { /* Did not train, no need to wait any further */ pci_info(dev, "Data Link Layer Link Active not set in 1000 msec\n"); - return; + return -ENOTTY; } } - if (!pci_device_is_present(child)) { - pci_dbg(child, "waiting additional %d ms to become accessible\n", delay); - msleep(delay); - } + return pci_dev_wait(child, reset_type, timeout - delay); } void pci_reset_secondary_bus(struct pci_dev *dev) @@ -4902,15 +4901,6 @@ void pci_reset_secondary_bus(struct pci_dev *dev) ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); - - /* - * Trhfa for conventional PCI is 2^25 clock cycles. - * Assuming a minimum 33MHz clock this results in a 1s - * delay before we can consider subordinate devices to - * be re-initialized. PCIe has some ways to shorten this, - * but we don't make use of them yet. - */ - ssleep(1); } void __weak pcibios_reset_secondary_bus(struct pci_dev *dev) @@ -4929,7 +4919,8 @@ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) { pcibios_reset_secondary_bus(dev); - return pci_dev_wait(dev, "bus reset", PCIE_RESET_READY_POLL_MS); + return pci_bridge_wait_for_secondary_bus(dev, "bus reset", + PCIE_RESET_READY_POLL_MS); } EXPORT_SYMBOL_GPL(pci_bridge_secondary_bus_reset); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9197d7362731..72436000ff25 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -47,6 +47,19 @@ int pci_bus_error_reset(struct pci_dev *dev); #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ +/* + * Following exit from Conventional Reset, devices must be ready within 1 sec + * (PCIe r6.0 sec 6.6.1). A D3cold to D0 transition implies a Conventional + * Reset (PCIe r6.0 sec 5.8). + */ +#define PCI_RESET_WAIT 1000 /* msec */ +/* + * Devices may extend the 1 sec period through Request Retry Status completions + * (PCIe r6.0 sec 2.3.1). The spec does not provide an upper limit, but 60 sec + * ought to be enough for any device to become responsive. 
+ */ +#define PCIE_RESET_READY_POLL_MS 60000 /* msec */ + /** * struct pci_platform_pm_ops - Firmware PM callbacks * @@ -108,7 +121,8 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); -void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev); +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type, + int timeout); static inline void pci_wakeup_event(struct pci_dev *dev) { diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index c556e7beafe3..f21d64ae4ffc 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -170,8 +170,8 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS, PCI_EXP_DPC_STATUS_TRIGGER); - if (!pcie_wait_for_link(pdev, true)) { - pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n"); + if (pci_bridge_wait_for_secondary_bus(pdev, "DPC", + PCIE_RESET_READY_POLL_MS)) { clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); ret = PCI_ERS_RESULT_DISCONNECT; } else { diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index fae032324210..18321cf9db5d 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -322,10 +322,7 @@ static void scsi_host_dev_release(struct device *dev) struct Scsi_Host *shost = dev_to_shost(dev); struct device *parent = dev->parent; - /* In case scsi_remove_host() has not been called. */ - scsi_proc_hostdir_rm(shost->hostt); - - /* Wait for functions invoked through call_rcu(&shost->rcu, ...) */ + /* Wait for functions invoked through call_rcu(&scmd->rcu, ...) */ rcu_barrier(); if (shost->tmf_work_q) diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index b58f4d9c296a..326265fd7f91 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -670,7 +670,7 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, goto out_fail; } port = sas_port_alloc_num(sas_node->parent_dev); - if ((sas_port_add(port))) { + if (!port || (sas_port_add(port))) { ioc_err(ioc, "failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); goto out_fail; @@ -695,6 +695,12 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, rphy = sas_expander_alloc(port, mpt3sas_port->remote_identify.device_type); + if (!rphy) { + ioc_err(ioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + goto out_delete_port; + } + rphy->identify = mpt3sas_port->remote_identify; if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE) { @@ -714,6 +720,7 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, __FILE__, __LINE__, __func__); sas_rphy_free(rphy); rphy = NULL; + goto out_delete_port; } if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE) { @@ -740,7 +747,10 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle, rphy_to_expander_device(rphy)); return mpt3sas_port; - out_fail: +out_delete_port: + sas_port_delete(port); + +out_fail: list_for_each_entry_safe(mpt3sas_phy, next, &mpt3sas_port->phy_list, port_siblings) list_del(&mpt3sas_phy->port_siblings); diff --git a/drivers/tty/serial/8250/8250_em.c b/drivers/tty/serial/8250/8250_em.c index f8e99995eee9..d94c3811a8f7 100644 --- a/drivers/tty/serial/8250/8250_em.c +++ b/drivers/tty/serial/8250/8250_em.c @@ -106,8 +106,8 @@ static int serial8250_em_probe(struct platform_device *pdev) 
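The new PCI_RESET_WAIT/PCIE_RESET_READY_POLL_MS pair above separates "the spec says 1 second" from "give a device that keeps returning Request Retry Status up to 60 seconds", and the bridge paths now funnel through pci_dev_wait()-style polling. A loose userspace mirror of that loop, assuming a responsive() probe callback supplied by the caller:

```c
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define RESET_WAIT_MS		1000	/* ready within 1 s of reset (spec) */
#define RESET_READY_POLL_MS	60000	/* cap when the device keeps retrying */

static void sleep_ms(long ms)
{
	struct timespec ts = { ms / 1000, (ms % 1000) * 1000000L };

	nanosleep(&ts, NULL);
}

/* Exponential back-off poll; only complain once the spec-mandated 1 s
 * window has passed. Loosely mirrors the pci_dev_wait() loop. */
static int wait_for_device(bool (*responsive)(void), int timeout_ms)
{
	int delay = 1;

	while (!responsive()) {
		if (delay > timeout_ms)
			return -1;	/* the kernel returns -ENOTTY here */
		if (delay > RESET_WAIT_MS)
			printf("not ready %dms after reset; waiting\n",
			       delay - 1);
		sleep_ms(delay);
		delay *= 2;
	}
	return 0;
}
```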
memset(&up, 0, sizeof(up)); up.port.mapbase = regs->start; up.port.irq = irq; - up.port.type = PORT_UNKNOWN; - up.port.flags = UPF_BOOT_AUTOCONF | UPF_FIXED_PORT | UPF_IOREMAP; + up.port.type = PORT_16750; + up.port.flags = UPF_FIXED_PORT | UPF_IOREMAP | UPF_FIXED_TYPE; up.port.dev = &pdev->dev; up.port.private_data = priv; diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 9cb0e8673f82..32cce52800a7 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -2159,9 +2159,15 @@ lpuart32_set_termios(struct uart_port *port, struct ktermios *termios, /* update the per-port timeout */ uart_update_timeout(port, termios->c_cflag, baud); - /* wait transmit engin complete */ - lpuart32_write(&sport->port, 0, UARTMODIR); - lpuart32_wait_bit_set(&sport->port, UARTSTAT, UARTSTAT_TC); + /* + * LPUART Transmission Complete Flag may never be set while queuing a break + * character, so skip waiting for transmission complete when UARTCTRL_SBK is + * asserted. + */ + if (!(old_ctrl & UARTCTRL_SBK)) { + lpuart32_write(&sport->port, 0, UARTMODIR); + lpuart32_wait_bit_set(&sport->port, UARTSTAT, UARTSTAT_TC); + } /* disable transmit and receive */ lpuart32_write(&sport->port, old_ctrl & ~(UARTCTRL_TE | UARTCTRL_RE), diff --git a/drivers/video/fbdev/stifb.c b/drivers/video/fbdev/stifb.c index 3feb6e40d56d..ef8a4c5fc687 100644 --- a/drivers/video/fbdev/stifb.c +++ b/drivers/video/fbdev/stifb.c @@ -921,6 +921,28 @@ SETUP_HCRX(struct stifb_info *fb) /* ------------------- driver specific functions --------------------------- */ +static int +stifb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) +{ + struct stifb_info *fb = container_of(info, struct stifb_info, info); + + if (var->xres != fb->info.var.xres || + var->yres != fb->info.var.yres || + var->bits_per_pixel != fb->info.var.bits_per_pixel) + return -EINVAL; + + var->xres_virtual = var->xres; + var->yres_virtual = var->yres; + var->xoffset = 0; + var->yoffset = 0; + var->grayscale = fb->info.var.grayscale; + var->red.length = fb->info.var.red.length; + var->green.length = fb->info.var.green.length; + var->blue.length = fb->info.var.blue.length; + + return 0; +} + static int stifb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, u_int transp, struct fb_info *info) @@ -1145,6 +1167,7 @@ stifb_init_display(struct stifb_info *fb) static const struct fb_ops stifb_ops = { .owner = THIS_MODULE, + .fb_check_var = stifb_check_var, .fb_setcolreg = stifb_setcolreg, .fb_blank = stifb_blank, .fb_fillrect = stifb_fillrect, @@ -1164,6 +1187,7 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) struct stifb_info *fb; struct fb_info *info; unsigned long sti_rom_address; + char modestr[32]; char *dev_name; int bpp, xres, yres; @@ -1342,6 +1366,9 @@ static int __init stifb_init_fb(struct sti_struct *sti, int bpp_pref) info->flags = FBINFO_HWACCEL_COPYAREA | FBINFO_HWACCEL_FILLRECT; info->pseudo_palette = &fb->pseudo_palette; + scnprintf(modestr, sizeof(modestr), "%dx%d-%d", xres, yres, bpp); + fb_find_mode(&info->var, info, modestr, NULL, 0, NULL, bpp); + /* This has to be done !!! 
*/ if (fb_alloc_cmap(&info->cmap, NR_PALETTE, 0)) goto out_err1; diff --git a/fs/attr.c b/fs/attr.c index 848ffe6e3c24..326a0db3296d 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,6 +18,65 @@ #include <linux/evm.h> #include <linux/ima.h> +#include "internal.h" + +/** + * setattr_should_drop_sgid - determine whether the setgid bit needs to be + * removed + * @inode: inode to check + * + * This function determines whether the setgid bit needs to be removed. + * We retain backwards compatibility and require setgid bit to be removed + * unconditionally if S_IXGRP is set. Otherwise we have the exact same + * requirements as setattr_prepare() and setattr_copy(). + * + * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. + */ +int setattr_should_drop_sgid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + + if (!(mode & S_ISGID)) + return 0; + if (mode & S_IXGRP) + return ATTR_KILL_SGID; + if (!in_group_or_capable(inode, inode->i_gid)) + return ATTR_KILL_SGID; + return 0; +} + +/** + * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to + * be dropped + * @inode: inode to check + * + * This function determines whether the set{g,u}id bits need to be removed. + * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the + * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both + * set{g,u}id bits need to be removed the corresponding mask of both flags is + * returned. + * + * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits + * to remove, 0 otherwise. + */ +int setattr_should_drop_suidgid(struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + kill |= setattr_should_drop_sgid(inode); + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(setattr_should_drop_suidgid); + static bool chown_ok(const struct inode *inode, kuid_t uid) { if (uid_eq(current_fsuid(), inode->i_uid) && @@ -90,9 +149,8 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr) if (!inode_owner_or_capable(inode)) return -EPERM; /* Also check the setgid bit! */ - if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_or_capable(inode, (ia_valid & ATTR_GID) ? 
+ attr->ia_gid : inode->i_gid)) attr->ia_mode &= ~S_ISGID; } @@ -193,9 +251,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_or_capable(inode, inode->i_gid)) mode &= ~S_ISGID; inode->i_mode = mode; } @@ -297,7 +353,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de } } if (ia_valid & ATTR_KILL_SGID) { - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 97cd4df04060..e11818801148 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -236,15 +236,32 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, size[0] = 8; /* sizeof __le64 */ data[0] = ptr; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } + } if (rc) goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); + num_rqst++; trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_INFO: diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b137006f0fd2..4409f56fc37e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -312,7 +312,7 @@ static int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst) { - int rc = 0; + int rc; struct kvec *iov; int n_vec; unsigned int send_length = 0; @@ -323,6 +323,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct msghdr smb_msg = {}; __be32 rfc1002_marker; + cifs_in_send_inc(server); if (cifs_rdma_enabled(server)) { /* return -EAGAIN when connecting or reconnecting */ rc = -EAGAIN; @@ -331,14 +332,17 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, goto smbd_done; } + rc = -EAGAIN; if (ssocket == NULL) - return -EAGAIN; + goto out; + rc = -ERESTARTSYS; if (fatal_signal_pending(current)) { cifs_dbg(FYI, "signal pending before send request\n"); - return -ERESTARTSYS; + goto out; } + rc = 0; /* cork the socket */ tcp_sock_set_cork(ssocket->sk, true); @@ -449,7 +453,8 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rc); else if (rc > 0) rc = 0; - +out: + cifs_in_send_dec(server); return rc; } @@ -826,9 +831,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, * I/O response may come back and free the mid entry on another thread. 
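The fs/attr.c hunks earlier replace should_remove_suid() with setattr_should_drop_suidgid(), whose key change is that setgid is now dropped even without group-exec when the writer is neither in the file's group nor CAP_FSETID-capable. The decision reduces to a small pure function; in this sketch in_group_or_capable() becomes a boolean parameter and the CAP_FSETID test is elided:

```c
#include <stdio.h>
#include <sys/stat.h>

#define ATTR_KILL_SUID 0x1
#define ATTR_KILL_SGID 0x2

/* setuid always dies on write; setgid dies if group-exec is set, or if
 * the writer is not in the file's group (and not privileged). */
static int should_drop_suidgid(mode_t mode, int writer_in_group_or_capable)
{
	int kill = 0;

	if (mode & S_ISUID)
		kill |= ATTR_KILL_SUID;
	if (mode & S_ISGID) {
		if ((mode & S_IXGRP) || !writer_in_group_or_capable)
			kill |= ATTR_KILL_SGID;
	}
	if (kill && S_ISREG(mode))	/* !capable(CAP_FSETID) check elided */
		return kill;
	return 0;
}

int main(void)
{
	printf("%d\n", should_drop_suidgid(S_IFREG | S_ISGID | 0600, 0)); /* 2: dropped */
	printf("%d\n", should_drop_suidgid(S_IFREG | S_ISGID | 0600, 1)); /* 0: kept */
	return 0;
}
```

With a setgid, non-group-exec file (the old "mandatory locking" case), the bit now survives a write only for a sufficiently privileged writer.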
*/ cifs_save_when_sent(mid); - cifs_in_send_inc(server); rc = smb_send_rqst(server, 1, rqst, flags); - cifs_in_send_dec(server); if (rc < 0) { revert_current_mid(server, mid->credits); @@ -1117,9 +1120,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, else midQ[i]->callback = cifs_compound_last_callback; } - cifs_in_send_inc(server); rc = smb_send_rqst(server, num_rqst, rqst, flags); - cifs_in_send_dec(server); for (i = 0; i < num_rqst; i++) cifs_save_when_sent(midQ[i]); @@ -1356,9 +1357,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, midQ->mid_state = MID_REQUEST_SUBMITTED; - cifs_in_send_inc(server); rc = smb_send(server, in_buf, len); - cifs_in_send_dec(server); cifs_save_when_sent(midQ); if (rc < 0) @@ -1495,9 +1494,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, } midQ->mid_state = MID_REQUEST_SUBMITTED; - cifs_in_send_inc(server); rc = smb_send(server, in_buf, len); - cifs_in_send_dec(server); cifs_save_when_sent(midQ); if (rc < 0) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1a654a1f3f46..6ba185b46ba3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4721,13 +4721,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, goto bad_inode; raw_inode = ext4_raw_inode(&iloc); - if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - ext4_error_inode(inode, function, line, 0, - "iget: root inode unallocated"); - ret = -EFSCORRUPTED; - goto bad_inode; - } - if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; @@ -4800,11 +4793,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if ((inode->i_mode == 0 || + if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { - /* this inode is deleted */ - ret = -ESTALE; + /* this inode is deleted or unallocated */ + if (flags & EXT4_IGET_SPECIAL) { + ext4_error_inode(inode, function, line, 0, + "iget: special inode unallocated"); + ret = -EFSCORRUPTED; + } else + ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1f47aeca7142..45f719c1e002 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3934,10 +3934,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = ext4_rename_dir_prepare(handle, &old); - if (retval) { - inode_unlock(old.inode); + if (retval) goto end_rename; - } } /* * If we're renaming a file within an inline_data dir and adding or diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 60e122761352..f3da1f2d4cb9 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -386,6 +386,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, struct inode *inode; int err; + /* + * We have to check for this corruption early as otherwise + * iget_locked() could wait indefinitely for the state of our + * parent inode. 
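The cifs/transport.c rework above pulls the cifs_in_send_inc()/cifs_in_send_dec() pair out of every caller and into __smb_send_rqst() itself, which forces the early returns to become "goto out" so the counter stays balanced on every path. A sketch of that shape; send_rqst and in_send are stand-ins, and -EINTR substitutes for the kernel-internal -ERESTARTSYS:

```c
#include <errno.h>

static int in_send;	/* models the server's in-flight send counter */

static int send_rqst(int socket_ready, int signal_pending)
{
	int rc;

	in_send++;		/* every exit below must undo this */
	rc = -EAGAIN;
	if (!socket_ready)
		goto out;
	rc = -EINTR;
	if (signal_pending)
		goto out;

	rc = 0;
	/* ... perform the actual send ... */
out:
	in_send--;
	return rc;
}
```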
+ */ + if (parent->i_ino == ea_ino) { + ext4_error(parent->i_sb, + "Parent and EA inode have the same ino %lu", ea_ino); + return -EFSCORRUPTED; + } + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode);
diff --git a/fs/inode.c b/fs/inode.c index 9f49e0bdc2f7..7ec90788d8be 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1854,35 +1854,6 @@ void touch_atime(const struct path *path) } EXPORT_SYMBOL(touch_atime); -/* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = d_inode(dentry)->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. @@ -1897,7 +1868,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) if (IS_NOSEC(inode)) return 0; - mask = should_remove_suid(dentry); + mask = setattr_should_drop_suidgid(inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -2147,10 +2118,6 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; - else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && - !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(dir, CAP_FSETID)) - mode &= ~S_ISGID; } else inode->i_gid = current_fsgid(); inode->i_mode = mode; @@ -2382,3 +2349,48 @@ int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa, return 0; } EXPORT_SYMBOL(vfs_ioc_fssetxattr_check); + +/** + * in_group_or_capable - check whether caller is CAP_FSETID privileged + * @inode: inode to check + * @gid: the new/current gid of @inode + * + * Check whether @gid is in the caller's group list or if the caller is + * privileged with CAP_FSETID over @inode. This can be used to determine + * whether the setgid bit can be kept or must be dropped. + * + * Return: true if the caller is sufficiently privileged, false if not. + */ +bool in_group_or_capable(const struct inode *inode, kgid_t gid) +{ + if (in_group_p(gid)) + return true; + if (capable_wrt_inode_uidgid(inode, CAP_FSETID)) + return true; + return false; +} + +/** + * mode_strip_sgid - handle the sgid bit for non-directories + * @dir: parent directory inode + * @mode: mode of the file to be created in @dir + * + * If the @mode of the new file has both the S_ISGID and S_IXGRP bit + * raised and @dir has the S_ISGID bit raised, ensure that the caller is + * either in the group of the parent directory or they have CAP_FSETID + * in their user namespace and are privileged over the parent directory. + * In all other cases, strip the S_ISGID bit from @mode.
+ * + * Return: the new mode to use for the file + */ +umode_t mode_strip_sgid(const struct inode *dir, umode_t mode) +{ + if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) + return mode; + if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) + return mode; + if (in_group_or_capable(dir, dir->i_gid)) + return mode; + return mode & ~S_ISGID; +} +EXPORT_SYMBOL(mode_strip_sgid); diff --git a/fs/internal.h b/fs/internal.h index 06d313b9beec..d5d9fcdae10c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -149,6 +149,7 @@ extern int vfs_open(const struct path *, struct file *); extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); +bool in_group_or_capable(const struct inode *inode, kgid_t gid); /* * fs-writeback.c @@ -196,3 +197,8 @@ int sb_init_dio_done_wq(struct super_block *sb); */ int do_statx(int dfd, const char __user *filename, unsigned flags, unsigned int mask, struct statx __user *buffer); + +/* + * fs/attr.c + */ +int setattr_should_drop_sgid(const struct inode *inode); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index bd7d58d27bfc..97a3c09fd96b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -138,19 +138,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); pgoff_t index = pos >> PAGE_SHIFT; - uint32_t pageofs = index << PAGE_SHIFT; int ret = 0; jffs2_dbg(1, "%s()\n", __func__); - if (pageofs > inode->i_size) { - /* Make new hole frag from old EOF to new page */ + if (pos > inode->i_size) { + /* Make new hole frag from old EOF to new position */ struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; uint32_t alloc_len; - jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", - (unsigned int)inode->i_size, pageofs); + jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new position\n", + (unsigned int)inode->i_size, (uint32_t)pos); ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); @@ -170,10 +169,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ri.mode = cpu_to_jemode(inode->i_mode); ri.uid = cpu_to_je16(i_uid_read(inode)); ri.gid = cpu_to_je16(i_gid_read(inode)); - ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs)); + ri.isize = cpu_to_je32((uint32_t)pos); ri.atime = ri.ctime = ri.mtime = cpu_to_je32(JFFS2_NOW()); ri.offset = cpu_to_je32(inode->i_size); - ri.dsize = cpu_to_je32(pageofs - inode->i_size); + ri.dsize = cpu_to_je32((uint32_t)pos - inode->i_size); ri.csize = cpu_to_je32(0); ri.compr = JFFS2_COMPR_ZERO; ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8)); @@ -203,7 +202,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, goto out_err; } jffs2_complete_reservation(c); - inode->i_size = pageofs; + inode->i_size = pos; mutex_unlock(&f->sem); } diff --git a/fs/namei.c b/fs/namei.c index 4159c140fa47..3d98db9802a7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2798,6 +2798,63 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); +/** + * mode_strip_umask - handle vfs umask stripping + * @dir: parent directory of the new inode + * @mode: mode of the new inode to be created in @dir + * + * Umask stripping depends on whether or not the filesystem supports POSIX + * ACLs. 
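mode_strip_sgid(), added above, decides whether a freshly created non-directory may keep a setgid bit inherited from a setgid parent directory. Its logic ports directly to userspace for experimentation; strip_sgid and the boolean privilege flag below are illustrative:

```c
#include <stdio.h>
#include <sys/stat.h>

/* A new non-directory keeps S_ISGID|S_IXGRP inherited from a setgid
 * parent only if the creator is in the parent's group or privileged. */
static mode_t strip_sgid(mode_t dir_mode, mode_t mode,
			 int in_group_or_capable)
{
	if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
		return mode;
	if (S_ISDIR(mode) || !(dir_mode & S_ISGID))
		return mode;
	if (in_group_or_capable)
		return mode;
	return mode & ~S_ISGID;
}

int main(void)
{
	mode_t m = strip_sgid(S_IFDIR | S_ISGID | 0775,
			      S_IFREG | S_ISGID | 0710, 0);

	printf("%o\n", (unsigned)m);	/* 100710: setgid bit stripped */
	return 0;
}
```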
If the filesystem doesn't support it umask stripping is done directly + * in here. If the filesystem does support POSIX ACLs umask stripping is + * deferred until the filesystem calls posix_acl_create(). + * + * Returns: mode + */ +static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) +{ + if (!IS_POSIXACL(dir)) + mode &= ~current_umask(); + return mode; +} + +/** + * vfs_prepare_mode - prepare the mode to be used for a new inode + * @dir: parent directory of the new inode + * @mode: mode of the new inode + * @mask_perms: allowed permission by the vfs + * @type: type of file to be created + * + * This helper consolidates and enforces vfs restrictions on the @mode of a new + * object to be created. + * + * Umask stripping depends on whether the filesystem supports POSIX ACLs (see + * the kernel documentation for mode_strip_umask()). Moving umask stripping + * after setgid stripping allows the same ordering for both non-POSIX ACL and + * POSIX ACL supporting filesystems. + * + * Note that it's currently valid for @type to be 0 if a directory is created. + * Filesystems raise that flag individually and we need to check whether each + * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a + * non-zero type. + * + * Returns: mode to be passed to the filesystem + */ +static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode, + umode_t mask_perms, umode_t type) +{ + mode = mode_strip_sgid(dir, mode); + mode = mode_strip_umask(dir, mode); + + /* + * Apply the vfs mandated allowed permission mask and set the type of + * file to be created before we call into the filesystem. + */ + mode &= (mask_perms & ~S_IFMT); + mode |= (type & S_IFMT); + + return mode; +} + int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { @@ -2807,8 +2864,8 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? 
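vfs_prepare_mode(), added above, is mostly about ordering: setgid stripping must precede umask stripping so filesystems with and without POSIX ACL support observe the same sequence, after which the vfs permission mask and the file type are applied. A condensed sketch; the sgid step is left as a comment (see the previous example) and prepare_mode is a hypothetical name:

```c
#include <sys/stat.h>

static mode_t prepare_mode(mode_t mode, int fs_has_posix_acl,
			   mode_t umask_bits, mode_t mask_perms, mode_t type)
{
	/* step 1: mode = mode_strip_sgid(dir, mode); */
	if (!fs_has_posix_acl)
		mode &= ~umask_bits;		/* mode_strip_umask() */
	mode &= (mask_perms & ~S_IFMT);		/* vfs-mandated permissions */
	mode |= (type & S_IFMT);		/* force the file type */
	return mode;
}
```

For instance, vfs_create() calls the real helper with S_IALLUGO and S_IFREG, while vfs_mkdir() passes S_IRWXUGO | S_ISVTX and a zero type.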
*/ - mode &= S_IALLUGO; - mode |= S_IFREG; + + mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; @@ -3072,8 +3129,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); + mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(&nd->path, dentry, mode); else @@ -3286,8 +3342,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) child = d_alloc(dentry, &slash_name); if (unlikely(!child)) goto out_err; - if (!IS_POSIXACL(dir)) - mode &= ~current_umask(); + mode = vfs_prepare_mode(dir, mode, mode, mode); error = dir->i_op->tmpfile(dir, child, mode); if (error) goto out_err; @@ -3548,6 +3603,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (!dir->i_op->mknod) return -EPERM; + mode = vfs_prepare_mode(dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; @@ -3596,9 +3652,8 @@ static long do_mknodat(int dfd, const char __user *filename, umode_t mode, if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&path, dentry, mode, dev); + error = security_path_mknod(&path, dentry, + mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out; switch (mode & S_IFMT) { @@ -3646,7 +3701,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (!dir->i_op->mkdir) return -EPERM; - mode &= (S_IRWXUGO|S_ISVTX); + mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; @@ -3673,9 +3728,8 @@ static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode) if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); - error = security_path_mkdir(&path, dentry, mode); + error = security_path_mkdir(&path, dentry, + mode_strip_umask(path.dentry->d_inode, mode)); if (!error) error = vfs_mkdir(path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1470b49adb2d..ca00cac5a12f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1994,7 +1994,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && should_remove_suid(file->f_path.dentry)) { + if (file && setattr_should_drop_suidgid(file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2282,7 +2282,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ - if (should_remove_suid(dentry)) { + if (setattr_should_drop_suidgid(inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 856474b0a1ae..df1f6b7aa797 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -198,6 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. 
*/ if (S_ISDIR(mode)) set_nlink(inode, 2); + mode = mode_strip_sgid(dir, mode); inode_init_owner(inode, dir, mode); status = dquot_initialize(inode); if (status) diff --git a/fs/open.c b/fs/open.c index b3fbb4300fc9..1ca4b236fdbe 100644 --- a/fs/open.c +++ b/fs/open.c @@ -665,10 +665,10 @@ int chown_common(const struct path *path, uid_t user, gid_t group) newattrs.ia_valid |= ATTR_GID; newattrs.ia_gid = gid; } - if (!S_ISDIR(inode->i_mode)) - newattrs.ia_valid |= - ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | + setattr_should_drop_sgid(inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs, &delegated_inode); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 24c7d30e41df..0926363179a7 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3190,7 +3190,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3270,7 +3270,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3290,7 +3290,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3375,6 +3375,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 7371a7f7c652..fbab1042bc90 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -800,9 +800,6 @@ xfs_alloc_file_space( quota_flag = XFS_QMOPT_RES_REGBLKS; } - /* - * Allocate and setup the transaction. 
- */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, 0, &tp); @@ -830,9 +827,9 @@ xfs_alloc_file_space( if (error) goto error0; - /* - * Complete the transaction - */ + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4d6bf8d4974f..9b6c5ba5fdfb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -94,8 +94,6 @@ xfs_update_prealloc_flags( ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flags & XFS_PREALLOC_SYNC) - xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } @@ -852,7 +850,6 @@ xfs_file_fallocate( struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; - enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -897,6 +894,10 @@ xfs_file_fallocate( goto out_unlock; } + error = file_modified(file); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -946,8 +947,6 @@ xfs_file_fallocate( } do_file_insert = true; } else { - flags |= XFS_PREALLOC_SET; - if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -1000,13 +999,6 @@ xfs_file_fallocate( } } - if (file->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - - error = xfs_update_prealloc_flags(ip, flags); - if (error) - goto out_unlock; - /* Change file size if needed */ if (new_size) { struct iattr iattr; @@ -1024,8 +1016,14 @@ xfs_file_fallocate( * leave shifted extents past EOF and hence losing access to * the data that is contained within them. */ - if (do_file_insert) + if (do_file_insert) { error = xfs_insert_file_space(ip, offset, len); + if (error) + goto out_unlock; + } + + if (file->f_flags & O_DSYNC) + error = xfs_log_force_inode(ip); out_unlock: xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 6a3026e78a9b..69fef29df428 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -595,37 +595,6 @@ xfs_vn_getattr( return 0; } -static void -xfs_setattr_mode( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; -} - -void -xfs_setattr_time( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (iattr->ia_valid & ATTR_ATIME) - inode->i_atime = iattr->ia_atime; - if (iattr->ia_valid & ATTR_CTIME) - inode->i_ctime = iattr->ia_ctime; - if (iattr->ia_valid & ATTR_MTIME) - inode->i_mtime = iattr->ia_mtime; -} - static int xfs_vn_change_ok( struct dentry *dentry, @@ -740,16 +709,6 @@ xfs_setattr_nonsize( goto out_cancel; } - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((inode->i_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) - inode->i_mode &= ~(S_ISUID|S_ISGID); - /* * Change the ownerships and register quota modifications * in the transaction. 
@@ -761,7 +720,6 @@ xfs_setattr_nonsize( olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - inode->i_uid = uid; } if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { @@ -772,15 +730,10 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - inode->i_gid = gid; } } - if (mask & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + setattr_copy(inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -1025,11 +978,8 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - if (iattr->ia_valid & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 4d24ff309f59..dd1bd0332f8e 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -18,7 +18,6 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); */ #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ -extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a2a5a0fd9233..402cf828cc91 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -126,7 +126,6 @@ __xfs_free_perag( { struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); - ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -145,7 +144,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); + XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f3082a957d5e..053b99929f83 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -164,10 +164,12 @@ xfs_fs_map_blocks( * that the blocks allocated and handed out to the client are * guaranteed to be present even after a server crash. */ - error = xfs_update_prealloc_flags(ip, - XFS_PREALLOC_SET | XFS_PREALLOC_SYNC); + error = xfs_update_prealloc_flags(ip, XFS_PREALLOC_SET); + if (!error) + error = xfs_log_force_inode(ip); if (error) goto out_unlock; + } else { xfs_iunlock(ip, lock_flags); } @@ -283,7 +285,8 @@ xfs_fs_commit_blocks( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_setattr_time(ip, iattr); + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_d.di_size = iattr->ia_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 64e5da33733b..3c17e0c0f816 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1318,8 +1318,15 @@ xfs_qm_quotacheck( error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); - if (error) + if (error) { + /* + * The inode walk may have partially populated the dquot + * caches. We must purge them before disabling quota and + * tearing down the quotainfo, or else the dquots will leak. 
+ */ + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); goto error_return; + } /* * We've made all the changes that we need to make incore. Flush them diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h index 2195daa289d2..055486e35e68 100644 --- a/include/drm/drm_bridge.h +++ b/include/drm/drm_bridge.h @@ -427,11 +427,11 @@ struct drm_bridge_funcs { * * The returned array must be allocated with kmalloc() and will be * freed by the caller. If the allocation fails, NULL should be - * returned. num_output_fmts must be set to the returned array size. + * returned. num_input_fmts must be set to the returned array size. * Formats listed in the returned array should be listed in decreasing * preference order (the core will try all formats until it finds one * that works). When the format is not supported NULL should be - * returned and num_output_fmts should be set to 0. + * returned and num_input_fmts should be set to 0. * * This method is called on all elements of the bridge chain as part of * the bus format negotiation process that happens in diff --git a/include/linux/fs.h b/include/linux/fs.h index 74e19bccbf73..8ce9e5c61ede 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1768,6 +1768,7 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); +umode_t mode_strip_sgid(const struct inode *dir, umode_t mode); /* * This is the "filldir" function type, used by readdir() to let @@ -2959,7 +2960,7 @@ extern void __destroy_inode(struct inode *); extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); -extern int should_remove_suid(struct dentry *); +extern int setattr_should_drop_suidgid(struct inode *); extern int file_remove_privs(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); @@ -3407,7 +3408,7 @@ int __init get_filesystem_list(char *buf); static inline bool is_sxid(umode_t mode) { - return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); + return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct inode *dir, struct inode *inode) diff --git a/include/linux/hid.h b/include/linux/hid.h index 2ba33d708942..256f34f49167 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -798,6 +798,7 @@ struct hid_driver { * @raw_request: send raw report request to device (e.g. feature report) * @output_report: send output report to device * @idle: send idle request to device + * @max_buffer_size: over-ride maximum data buffer size (default: HID_MAX_BUFFER_SIZE) */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); @@ -822,6 +823,8 @@ struct hid_ll_driver { int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len); int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); + + unsigned int max_buffer_size; }; extern struct hid_ll_driver i2c_hid_ll_driver; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b478a16ef284..9ef63bc14b00 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -270,9 +270,11 @@ struct hh_cache { * relationship HH alignment <= LL alignment. 
*/ #define LL_RESERVED_SPACE(dev) \ - ((((dev)->hard_header_len+(dev)->needed_headroom)&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ + & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ - ((((dev)->hard_header_len+(dev)->needed_headroom+(extra))&~(HH_DATA_MOD - 1)) + HH_DATA_MOD) + ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ + & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, diff --git a/include/linux/sh_intc.h b/include/linux/sh_intc.h index c255273b0281..37ad81058d6a 100644 --- a/include/linux/sh_intc.h +++ b/include/linux/sh_intc.h @@ -97,7 +97,10 @@ struct intc_hw_desc { unsigned int nr_subgroups; }; -#define _INTC_ARRAY(a) a, __same_type(a, NULL) ? 0 : sizeof(a)/sizeof(*a) +#define _INTC_SIZEOF_OR_ZERO(a) (_Generic(a, \ + typeof(NULL): 0, \ + default: sizeof(a))) +#define _INTC_ARRAY(a) a, _INTC_SIZEOF_OR_ZERO(a)/sizeof(*a) #define INTC_HW_DESC(vectors, groups, mask_regs, \ prio_regs, sense_regs, ack_regs) \ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index e4c5df71f0e7..4e1356c35fe6 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -234,12 +234,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. * - * When lockdep is enabled, we make sure to always do the RCU portions of - * the tracepoint code, regardless of whether tracing is on. However, - * don't check if the condition is false, due to interaction with idle - * instrumentation. This lets us find RCU issues triggered with tracepoints - * even when this tracepoint is off. This code has no purpose other than - * poking RCU a bit. + * When lockdep is enabled, we make sure to always test if RCU is + * "watching" regardless if the tracepoint is enabled or not. Tracepoints + * require RCU to be active, and it should always warn at the tracepoint + * site if it is not watching, as it will need to be active when the + * tracepoint is enabled. 
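The sh_intc change above replaces a __same_type(a, NULL) ternary with _Generic, so that passing NULL yields zero entries instead of tripping a sizeof-pointer division warning on a pointer operand. The same trick works in plain C wherever NULL is defined as ((void *)0), as it is in the kernel and in glibc's C mode; this standalone demo is an assumption-laden sketch, not the kernel macro:

```c
#include <stddef.h>
#include <stdio.h>

/* _Generic selects 0 for a null pointer constant (type void * here)
 * and sizeof(a) for anything else, so no sizeof is ever formed on a
 * pointer operand. */
#define SIZEOF_OR_ZERO(a) _Generic((a), void *: 0, default: sizeof(a))

static int vectors[] = { 1, 2, 3 };

int main(void)
{
	printf("%zu\n", SIZEOF_OR_ZERO(vectors) / sizeof(*vectors)); /* 3 */
	printf("%zu\n", (size_t)SIZEOF_OR_ZERO(NULL));               /* 0 */
	return 0;
}
```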
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index e4c5df71f0e7..4e1356c35fe6 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -234,12 +234,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
  *
- * When lockdep is enabled, we make sure to always do the RCU portions of
- * the tracepoint code, regardless of whether tracing is on. However,
- * don't check if the condition is false, due to interaction with idle
- * instrumentation. This lets us find RCU issues triggered with tracepoints
- * even when this tracepoint is off. This code has no purpose other than
- * poking RCU a bit.
+ * When lockdep is enabled, we make sure to always test if RCU is
+ * "watching" regardless if the tracepoint is enabled or not. Tracepoints
+ * require RCU to be active, and it should always warn at the tracepoint
+ * site if it is not watching, as it will need to be active when the
+ * tracepoint is enabled.
  */
 #define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
	extern int __traceiter_##name(data_proto);			\
@@ -253,9 +252,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
				TP_ARGS(data_args),			\
				TP_CONDITION(cond), 0);			\
		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
-			rcu_read_lock_sched_notrace();			\
-			rcu_dereference_sched(__tracepoint_##name.funcs);\
-			rcu_read_unlock_sched_notrace();		\
+			WARN_ON_ONCE(!rcu_is_watching());		\
		}							\
	}								\
	__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),		\
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 445afda927f4..fd799567fc23 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -5792,10 +5792,10 @@ static int io_arm_poll_handler(struct io_kiocb *req)
		}
	} else {
		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+		if (unlikely(!apoll))
+			return IO_APOLL_ABORTED;
		apoll->poll.retries = APOLL_MAX_RETRY;
	}
-	if (unlikely(!apoll))
-		return IO_APOLL_ABORTED;
	apoll->double_poll = NULL;
	req->apoll = apoll;
	req->flags |= REQ_F_POLLED;
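A note on the io_uring hunk above: this is the classic use-before-check shape. The NULL test sat below the if/else, but the else branch already stored through the pointer returned by kmalloc(), so an allocation failure meant a NULL dereference before the test could ever fire. Reduced to a hedged standalone sketch (the struct and names are illustrative, not the io_uring types):

#include <stdlib.h>

struct poll_state { int retries; };

/* Buggy shape: the else path writes through the pointer before the
 * NULL test that sits below the if/else has a chance to run. */
struct poll_state *arm_poll_buggy(struct poll_state *cached)
{
	struct poll_state *p;

	if (cached)
		p = cached;
	else {
		p = malloc(sizeof(*p));
		p->retries = 3;		/* NULL dereference if malloc failed */
	}
	if (!p)				/* too late on the else path */
		return NULL;
	return p;
}

/* Fixed shape, as in the hunk: check immediately after allocating,
 * on the only path where the pointer can actually be NULL. */
struct poll_state *arm_poll_fixed(struct poll_state *cached)
{
	struct poll_state *p;

	if (cached)
		p = cached;
	else {
		p = malloc(sizeof(*p));
		if (!p)
			return NULL;
		p->retries = 3;
	}
	return p;
}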
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d97c189695cb..67829b6e07bd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1538,7 +1538,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
	key.flags = end;	/* overload flags, as it is unsigned long */
 
	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		if (end < pg->records[0].ip ||
+		if (pg->index == 0 ||
+		    end < pg->records[0].ip ||
		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
			continue;
		rec = bsearch(&key, pg->records, pg->index,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8637eab2986e..ce45bdd9077d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4705,6 +4705,8 @@ loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
 static const struct file_operations tracing_fops = {
	.open		= tracing_open,
	.read		= seq_read,
+	.read_iter	= seq_read_iter,
+	.splice_read	= generic_file_splice_read,
	.write		= tracing_write_stub,
	.llseek		= tracing_lseek,
	.release	= tracing_release,
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index ccc99cd23f3c..9ed65191888e 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1087,6 +1087,9 @@ static const char *hist_field_name(struct hist_field *field,
 {
	const char *field_name = "";
 
+	if (WARN_ON_ONCE(!field))
+		return field_name;
+
	if (level > 1)
		return field_name;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9b15760e0541..e4c690c21fc9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1994,7 +1994,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 {
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;
-	pmd_t _pmd;
+	pmd_t _pmd, old_pmd;
	int i;
 
	/*
@@ -2005,7 +2005,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
-	pmdp_huge_clear_flush(vma, haddr, pmd);
+	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
 
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);
@@ -2014,6 +2014,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		pte_t *pte, entry;
		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
+		if (pmd_uffd_wp(old_pmd))
+			entry = pte_mkuffd_wp(entry);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 5f786ef662ea..41f890bf9d4c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -573,6 +573,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
			cfg->fc_scope = RT_SCOPE_UNIVERSE;
	}
 
+	if (!cfg->fc_table)
+		cfg->fc_table = RT_TABLE_MAIN;
+
	if (cmd == SIOCDELRT)
		return 0;
 
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index be75b409445c..99f70b990eb1 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -613,10 +613,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
	}
 
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
-	if (headroom > dev->needed_headroom)
-		dev->needed_headroom = headroom;
+	if (headroom > READ_ONCE(dev->needed_headroom))
+		WRITE_ONCE(dev->needed_headroom, headroom);
 
-	if (skb_cow_head(skb, dev->needed_headroom)) {
+	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
@@ -797,10 +797,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 
	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
-	if (max_headroom > dev->needed_headroom)
-		dev->needed_headroom = max_headroom;
+	if (max_headroom > READ_ONCE(dev->needed_headroom))
+		WRITE_ONCE(dev->needed_headroom, max_headroom);
 
-	if (skb_cow_head(skb, dev->needed_headroom)) {
+	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
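A note on the ip_tunnel hunks (and the LL_RESERVED_SPACE change earlier): dev->needed_headroom is read on transmit paths that run concurrently with a CPU enlarging it, so the plain accesses become READ_ONCE()/WRITE_ONCE() pairs. This annotates the intentional data race and prevents load/store tearing; it adds no memory ordering. Roughly the same pattern in portable C11 relaxed atomics, which is about what the kernel macros amount to here (field and function names are illustrative):

#include <stdatomic.h>

struct net_dev_demo {
	/* grow-only hint, updated from several transmit contexts */
	_Atomic unsigned short needed_headroom;
};

/* Reader: one coherent, non-torn load, as READ_ONCE() would give. */
static unsigned short headroom_hint(struct net_dev_demo *dev)
{
	return atomic_load_explicit(&dev->needed_headroom,
				    memory_order_relaxed);
}

/* Writer: only publish a larger value. Losing a race just means some
 * other writer published an equal-or-larger hint, which is harmless. */
static void maybe_grow_headroom(struct net_dev_demo *dev, unsigned short want)
{
	if (want > atomic_load_explicit(&dev->needed_headroom,
					memory_order_relaxed))
		atomic_store_explicit(&dev->needed_headroom, want,
				      memory_order_relaxed);
}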
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index eefd032bc6db..e4ad274ec7a3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3609,7 +3609,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
	th->doff = (tcp_header_size >> 2);
-	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
	/* Okay, we have all we need - do the md5 hash if needed */
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 0d4cab94c5dd..a03a322e0cc1 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1267,8 +1267,8 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
	 */
	max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr)
			+ dst->header_len + t->hlen;
-	if (max_headroom > dev->needed_headroom)
-		dev->needed_headroom = max_headroom;
+	if (max_headroom > READ_ONCE(dev->needed_headroom))
+		WRITE_ONCE(dev->needed_headroom, max_headroom);
 
	err = ip6_tnl_encap(skb, t, &proto, fl6);
	if (err)
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 349c6ac3313f..6f84978a7726 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -83,7 +83,7 @@ struct iucv_irq_data {
	u16 ippathid;
	u8  ipflags1;
	u8  iptype;
-	u32 res2[8];
+	u32 res2[9];
 };
 
 struct iucv_irq_list {
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 3b154ad4945c..607519246bf2 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -275,7 +275,6 @@ void mptcp_subflow_reset(struct sock *ssk)
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;
 
-	tcp_set_state(ssk, TCP_CLOSE);
	tcp_send_active_reset(ssk, GFP_ATOMIC);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) &&
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9953e8053753..1818dbf089ca 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -43,7 +43,7 @@ static int nft_masq_init(const struct nft_ctx *ctx,
			 const struct nft_expr *expr,
			 const struct nlattr * const tb[])
 {
-	u32 plen = sizeof_field(struct nf_nat_range, min_addr.all);
+	u32 plen = sizeof_field(struct nf_nat_range, min_proto.all);
	struct nft_masq *priv = nft_expr_priv(expr);
	int err;
 
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index db8f9116eeb4..cd4eb4996aff 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -226,7 +226,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
		priv->flags |= NF_NAT_RANGE_MAP_IPS;
	}
 
-	plen = sizeof_field(struct nf_nat_range, min_addr.all);
+	plen = sizeof_field(struct nf_nat_range, min_proto.all);
	if (tb[NFTA_NAT_REG_PROTO_MIN]) {
		err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN],
					      &priv->sreg_proto_min, plen);
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index ba09890dddb5..e64f531d66cf 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -48,7 +48,7 @@ static int nft_redir_init(const struct nft_ctx *ctx,
	unsigned int plen;
	int err;
 
-	plen = sizeof_field(struct nf_nat_range, min_addr.all);
+	plen = sizeof_field(struct nf_nat_range, min_proto.all);
	if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
		err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN],
					      &priv->sreg_proto_min, plen);
@@ -232,7 +232,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = {
	.name		= "redir",
	.ops		= &nft_redir_inet_ops,
	.policy		= nft_redir_policy,
-	.maxattr	= NFTA_MASQ_MAX,
+	.maxattr	= NFTA_REDIR_MAX,
	.owner		= THIS_MODULE,
 };
 
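A note on the three netfilter hunks above: they fix one copy-paste slip repeated across masq/nat/redir. The register being validated holds a layer-4 port, so its expected length must come from the proto union (min_proto.all, a __be16) rather than the address union (min_addr.all, which is IPv6-sized), otherwise a 16-byte length is accepted for a 2-byte destination. sizeof_field() itself is plain member sizing without needing an instance, sketched here with a stand-in struct (shapes only, not the real nf_nat_range):

#include <stdio.h>
#include <stdint.h>

/* Same idea as the kernel's sizeof_field(): the size of a member,
 * computed from the type alone via a null pointer that is never
 * dereferenced (sizeof is an unevaluated context). */
#define sizeof_field(TYPE, MEMBER) sizeof(((TYPE *)0)->MEMBER)

/* Stand-in for the nf_nat_range layout. */
struct nat_range_demo {
	struct { uint32_t all[4]; } min_addr;	/* IPv6-sized address */
	struct { uint16_t all; } min_proto;	/* 16-bit port */
};

int main(void)
{
	printf("min_addr.all : %zu bytes\n",
	       sizeof_field(struct nat_range_demo, min_addr.all));	/* 16 */
	printf("min_proto.all: %zu bytes\n",
	       sizeof_field(struct nat_range_demo, min_proto.all));	/* 2 */
	/* Using the first where the second is meant over-sizes the
	 * register load by a factor of eight, which is the bug fixed. */
	return 0;
}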
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 94503f36b9a6..9125d28d9ff5 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -104,6 +104,9 @@ int smc_cdc_msg_send(struct smc_connection *conn,
	union smc_host_cursor cfed;
	int rc;
 
+	if (unlikely(!READ_ONCE(conn->sndbuf_desc)))
+		return -ENOBUFS;
+
	smc_cdc_add_pending_send(conn, pend);
 
	conn->tx_cdc_seq++;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index bf485a2017a4..e84241ff4ac4 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -912,7 +912,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
	if (lgr->terminating)
		return;	/* lgr already terminating */
	/* cancel free_work sync, will terminate when lgr->freeing is set */
-	cancel_delayed_work_sync(&lgr->free_work);
+	cancel_delayed_work(&lgr->free_work);
	lgr->terminating = 1;
 
	/* kill remaining link group connections */
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index fdbd56ed4bd5..ba73014805a4 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2611,9 +2611,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
		if (inner_mode == NULL)
			goto error;
 
-		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL))
-			goto error;
-
		x->inner_mode = *inner_mode;
 
		if (x->props.family == AF_INET)
diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c
index 2a5ba9dca6b0..f96e70c85f84 100644
--- a/sound/hda/intel-dsp-config.c
+++ b/sound/hda/intel-dsp-config.c
@@ -359,6 +359,15 @@ static const struct config_entry config_table[] = {
	},
 #endif
 
+/* Meteor Lake */
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_METEORLAKE)
+	/* Meteorlake-P */
+	{
+		.flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE,
+		.device = 0x7e28,
+	},
+#endif
+
 };
 
 static const struct config_entry *snd_intel_dsp_find_config
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 494bfd2135a9..de1fe604905f 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -365,14 +365,15 @@ enum {
 #define needs_eld_notify_link(chip)	false
 #endif
 
-#define CONTROLLER_IN_GPU(pci)	(((pci)->device == 0x0a0c) || \
+#define CONTROLLER_IN_GPU(pci)	(((pci)->vendor == 0x8086) && \
+					((((pci)->device == 0x0a0c) || \
					((pci)->device == 0x0c0c) || \
					((pci)->device == 0x0d0c) || \
					((pci)->device == 0x160c) || \
					((pci)->device == 0x490d) || \
					((pci)->device == 0x4f90) || \
					((pci)->device == 0x4f91) || \
-					((pci)->device == 0x4f92))
+					((pci)->device == 0x4f92)))
 
 #define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98)
 
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index f2ef75c8de42..2cf6600c9ca8 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -9091,6 +9091,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
	SND_PCI_QUIRK(0x144d, 0xc830, "Samsung Galaxy Book Ion (NT950XCJ-X716A)", ALC298_FIXUP_SAMSUNG_AMP),
	SND_PCI_QUIRK(0x144d, 0xc832, "Samsung Galaxy Book Flex Alpha (NP730QCJ)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET),
	SND_PCI_QUIRK(0x144d, 0xca03, "Samsung Galaxy Book2 Pro 360 (NP930QED)", ALC298_FIXUP_SAMSUNG_AMP),
+	SND_PCI_QUIRK(0x144d, 0xc868, "Samsung Galaxy Book2 Pro (NP930XED)", ALC298_FIXUP_SAMSUNG_AMP),
	SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC),
	SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC),
	SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC),
diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py
index 834066d465fc..f0fbd7367f4f 100755
--- a/tools/testing/selftests/net/devlink_port_split.py
+++ b/tools/testing/selftests/net/devlink_port_split.py
@@ -57,6 +57,8 @@ class devlink_ports(object):
         assert stderr == ""
         ports = json.loads(stdout)['port']
 
+        validate_devlink_output(ports, 'flavour')
+
         for port in ports:
             if dev in port:
                 if ports[port]['flavour'] == 'physical':
@@ -218,6 +220,27 @@ def split_splittable_port(port, k, lanes, dev):
     unsplit(port.bus_info)
 
 
+def validate_devlink_output(devlink_data, target_property=None):
+    """
+    Determine if test should be skipped by checking:
+      1. devlink_data contains values
+      2. The target_property exist in devlink_data
+    """
+    skip_reason = None
+    if any(devlink_data.values()):
+        if target_property:
+            skip_reason = "{} not found in devlink output, test skipped".format(target_property)
+            for key in devlink_data:
+                if target_property in devlink_data[key]:
+                    skip_reason = None
+    else:
+        skip_reason = 'devlink output is empty, test skipped'
+
+    if skip_reason:
+        print(skip_reason)
+        sys.exit(KSFT_SKIP)
+
+
 def make_parser():
     parser = argparse.ArgumentParser(description='A test for port splitting.')
     parser.add_argument('--dev',
@@ -238,6 +261,7 @@ def main(cmdline=None):
         stdout, stderr = run_command(cmd)
         assert stderr == ""
 
+        validate_devlink_output(json.loads(stdout))
         devs = json.loads(stdout)['dev']
         dev = list(devs.keys())[0]
 
@@ -249,6 +273,7 @@ def main(cmdline=None):
 
     ports = devlink_ports(dev)
 
+    found_max_lanes = False
     for port in ports.if_names:
         max_lanes = get_max_lanes(port.name)
 
@@ -271,6 +296,11 @@ def main(cmdline=None):
             split_splittable_port(port, lane, max_lanes, dev)
 
             lane //= 2
+        found_max_lanes = True
+
+    if not found_max_lanes:
+        print(f"Test not started, no port of device {dev} reports max_lanes")
+        sys.exit(KSFT_SKIP)
 
 
 if __name__ == "__main__":
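A closing note, on the hda_intel.c hunk a little further up: the CONTROLLER_IN_GPU() device-ID list is only meaningful inside Intel's PCI vendor space, so without the new vendor test a non-Intel controller that happened to reuse one of those device IDs would wrongly take the GPU power-well code path. The guard reduces to an ordinary vendor-plus-device match, sketched here (IDs copied from the hunk; the struct and function are illustrative, not the driver's code):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PCI_VENDOR_ID_INTEL 0x8086

struct pci_id_demo { uint16_t vendor, device; };

static const uint16_t gpu_hda_ids[] = {
	0x0a0c, 0x0c0c, 0x0d0c, 0x160c, 0x490d, 0x4f90, 0x4f91, 0x4f92,
};

/* Match only when the vendor is Intel AND the device is in the table;
 * checking the device ID alone is what the hunk fixes. */
static bool controller_in_gpu(const struct pci_id_demo *pci)
{
	if (pci->vendor != PCI_VENDOR_ID_INTEL)
		return false;
	for (size_t i = 0; i < sizeof(gpu_hda_ids) / sizeof(gpu_hda_ids[0]); i++)
		if (pci->device == gpu_hda_ids[i])
			return true;
	return false;
}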