diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst index 5b36e45fef60..edb533128712 100644 --- a/Documentation/riscv/vm-layout.rst +++ b/Documentation/riscv/vm-layout.rst @@ -47,7 +47,7 @@ RISC-V Linux Kernel SV39 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffffffc6fee00000 | -228 GB | ffffffc6feffffff | 2 MB | fixmap + ffffffc6fea00000 | -228 GB | ffffffc6feffffff | 6 MB | fixmap ffffffc6ff000000 | -228 GB | ffffffc6ffffffff | 16 MB | PCI io ffffffc700000000 | -228 GB | ffffffc7ffffffff | 4 GB | vmemmap ffffffc800000000 | -224 GB | ffffffd7ffffffff | 64 GB | vmalloc/ioremap space @@ -83,7 +83,7 @@ RISC-V Linux Kernel SV48 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffff8d7ffee00000 | -114.5 TB | ffff8d7ffeffffff | 2 MB | fixmap + ffff8d7ffea00000 | -114.5 TB | ffff8d7ffeffffff | 6 MB | fixmap ffff8d7fff000000 | -114.5 TB | ffff8d7fffffffff | 16 MB | PCI io ffff8d8000000000 | -114.5 TB | ffff8f7fffffffff | 2 TB | vmemmap ffff8f8000000000 | -112.5 TB | ffffaf7fffffffff | 32 TB | vmalloc/ioremap space diff --git a/Makefile b/Makefile index d2eff3747f2f..a5cfcd0a85a9 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 26 +SUBLEVEL = 27 EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 019472dd98ff..54ccdcc2dbdf 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1178,6 +1178,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + if (fault_status != FSC_PERM || + (logging_active && write_fault)) { + ret = kvm_mmu_topup_memory_cache(memcache, + kvm_mmu_cache_min_pages(kvm)); + if (ret) + return ret; + } + /* * Let's check if we will get back a huge page backed by hugetlbfs, or * get block mapping for device MMIO region. @@ -1234,36 +1248,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, fault_ipa &= ~(vma_pagesize - 1); gfn = fault_ipa >> PAGE_SHIFT; - mmap_read_unlock(current->mm); - - /* - * Permission faults just need to update the existing leaf entry, - * and so normally don't require allocations from the memcache. The - * only exception to this is when dirty logging is enabled at runtime - * and a write fault needs to collapse a block entry into a table. - */ - if (fault_status != FSC_PERM || (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; - } - mmu_seq = vcpu->kvm->mmu_invalidate_seq; /* - * Ensure the read of mmu_invalidate_seq happens before we call - * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk - * the page we just got a reference to gets unmapped before we have a - * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_gfn will take it away - * from us again properly. 
This smp_rmb() interacts with the smp_wmb() - * in kvm_mmu_notifier_invalidate_<page|range_end>. + * Read mmu_invalidate_seq so that KVM can detect if the results of + * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * acquiring kvm->mmu_lock. * - * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is - * used to avoid unnecessary overhead introduced to locate the memory - * slot because it's always fixed even @gfn is adjusted for huge pages. + * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs + * with the smp_wmb() in kvm_mmu_invalidate_end(). */ - smp_rmb(); + mmu_seq = vcpu->kvm->mmu_invalidate_seq; + mmap_read_unlock(current->mm); pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, write_fault, &writable, NULL); diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 5c3e7b97fcc6..0a55099bb734 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -22,6 +22,14 @@ */ enum fixed_addresses { FIX_HOLE, + /* + * The fdt fixmap mapping must be PMD aligned and will be mapped + * using PMD entries in fixmap_pmd in 64-bit and a PGD entry in 32-bit. + */ + FIX_FDT_END, + FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1, + + /* Below fixmaps will be mapped using fixmap_pte */ FIX_PTE, FIX_PMD, FIX_PUD, diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 92ec2d9d7273..2aeaf8e3a4ab 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -87,9 +87,13 @@ #define FIXADDR_TOP PCI_IO_START #ifdef CONFIG_64BIT -#define FIXADDR_SIZE PMD_SIZE +#define MAX_FDT_SIZE PMD_SIZE +#define FIX_FDT_SIZE (MAX_FDT_SIZE + SZ_2M) +#define FIXADDR_SIZE (PMD_SIZE + FIX_FDT_SIZE) #else -#define FIXADDR_SIZE PGDIR_SIZE +#define MAX_FDT_SIZE PGDIR_SIZE +#define FIX_FDT_SIZE MAX_FDT_SIZE +#define FIXADDR_SIZE (PGDIR_SIZE + FIX_FDT_SIZE) #endif #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 86acd690d529..2acf51c23567 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -278,12 +278,8 @@ void __init setup_arch(char **cmdline_p) #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa)))) - unflatten_device_tree(); - else - pr_err("No DTB found in kernel mappings\n"); + unflatten_device_tree(); #endif - early_init_fdt_scan_reserved_mem(); misc_mem_init(); init_resources(); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 50a1b6edd491..6f47ced3175b 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -57,7 +57,6 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] EXPORT_SYMBOL(empty_zero_page); extern char _start[]; -#define DTB_EARLY_BASE_VA PGDIR_SIZE void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; @@ -236,31 +235,22 @@ static void __init setup_bootmem(void) set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); reserve_initrd_mem(); + + /* + * No allocation should be done before reserving the memory as defined + * in the device tree, otherwise the allocation could end up in a + * reserved region. + */ + early_init_fdt_scan_reserved_mem(); + /* * If DTB is built in, no need to reserve its memblock. 
* Otherwise, do reserve it but avoid using * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { - /* - * In case the DTB is not located in a memory region we won't - * be able to locate it later on via the linear mapping and - * get a segfault when accessing it via __va(dtb_early_pa). - * To avoid this situation copy DTB to a memory region. - * Note that memblock_phys_alloc will also reserve DTB region. - */ - if (!memblock_is_memory(dtb_early_pa)) { - size_t fdt_size = fdt_totalsize(dtb_early_va); - phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); - void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); - - memcpy(new_dtb_early_va, dtb_early_va, fdt_size); - early_memunmap(new_dtb_early_va, fdt_size); - _dtb_early_pa = new_dtb_early_pa; - } else - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); - } + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); dma_contiguous_reserve(dma32_phys_limit); if (IS_ENABLED(CONFIG_64BIT)) @@ -279,9 +269,6 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); -static p4d_t __maybe_unused early_dtb_p4d[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); -static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) @@ -626,9 +613,6 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define trampoline_pgd_next (pgtable_l5_enabled ? \ (uintptr_t)trampoline_p4d : (pgtable_l4_enabled ? \ (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)) -#define early_dtb_pgd_next (pgtable_l5_enabled ? \ - (uintptr_t)early_dtb_p4d : (pgtable_l4_enabled ? \ - (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) @@ -636,7 +620,6 @@ static void __init create_p4d_mapping(p4d_t *p4dp, #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) #define fixmap_pgd_next ((uintptr_t)fixmap_pte) -#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) #define create_p4d_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) @@ -859,32 +842,28 @@ static void __init create_kernel_page_table(pgd_t *pgdir, bool early) * this means 2 PMD entries whereas for 32-bit kernel, this is only 1 PGDIR * entry. */ -static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) +static void __init create_fdt_early_page_table(pgd_t *pgdir, + uintptr_t fix_fdt_va, + uintptr_t dtb_pa) { -#ifndef CONFIG_BUILTIN_DTB uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); - create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, - PGDIR_SIZE, - IS_ENABLED(CONFIG_64BIT) ? 
PAGE_TABLE : PAGE_KERNEL); - - if (pgtable_l5_enabled) - create_p4d_mapping(early_dtb_p4d, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pud, P4D_SIZE, PAGE_TABLE); - - if (pgtable_l4_enabled) - create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, - (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); +#ifndef CONFIG_BUILTIN_DTB + /* Make sure the fdt fixmap address is always aligned on PMD size */ + BUILD_BUG_ON(FIX_FDT % (PMD_SIZE / PAGE_SIZE)); - if (IS_ENABLED(CONFIG_64BIT)) { - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, + /* In 32-bit only, the fdt lies in its own PGD */ + if (!IS_ENABLED(CONFIG_64BIT)) { + create_pgd_mapping(early_pg_dir, fix_fdt_va, + pa, MAX_FDT_SIZE, PAGE_KERNEL); + } else { + create_pmd_mapping(fixmap_pmd, fix_fdt_va, pa, PMD_SIZE, PAGE_KERNEL); - create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, + create_pmd_mapping(fixmap_pmd, fix_fdt_va + PMD_SIZE, pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); } - dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); + dtb_early_va = (void *)fix_fdt_va + (dtb_pa & (PMD_SIZE - 1)); #else /* * For 64-bit kernel, __va can't be used since it would return a linear @@ -1054,7 +1033,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_kernel_page_table(early_pg_dir, true); /* Setup early mapping for FDT early scan */ - create_fdt_early_page_table(early_pg_dir, dtb_pa); + create_fdt_early_page_table(early_pg_dir, + __fix_to_virt(FIX_FDT), dtb_pa); /* * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap @@ -1096,6 +1076,16 @@ static void __init setup_vm_final(void) u64 i; /* Setup swapper PGD for fixmap */ +#if !defined(CONFIG_64BIT) + /* + * In 32-bit, the device tree lies in a pgd entry, so it must be copied + * directly in swapper_pg_dir in addition to the pgd entry that points + * to fixmap_pte. + */ + unsigned long idx = pgd_index(__fix_to_virt(FIX_FDT)); + + set_pgd(&swapper_pg_dir[idx], early_pg_dir[idx]); +#endif create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), PGDIR_SIZE, PAGE_TABLE); diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index d2e95d1d4db7..1aa64846e539 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -3,9 +3,14 @@ core-y += arch/x86/crypto/ # # Disable SSE and other FP/SIMD instructions to match normal x86 +# This is required to work around issues in older LLVM versions, but breaks +# GCC versions < 11. See: +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652 # +ifeq ($(CONFIG_CC_IS_CLANG),y) KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 +endif ifeq ($(CONFIG_X86_32),y) START := 0x8048000 diff --git a/drivers/base/dd.c b/drivers/base/dd.c index c463173f1fb1..97ab1468a876 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -718,7 +718,12 @@ static int really_probe_debug(struct device *dev, struct device_driver *drv) calltime = ktime_get(); ret = really_probe(dev, drv); rettime = ktime_get(); - pr_debug("probe of %s returned %d after %lld usecs\n", + /* + * Don't change this to pr_debug() because that requires + * CONFIG_DYNAMIC_DEBUG and we want a simple 'initcall_debug' on the + * kernel commandline to print this all the time at the debug level. 
+ */ + printk(KERN_DEBUG "probe of %s returned %d after %lld usecs\n", dev_name(dev), ret, ktime_us_delta(rettime, calltime)); return ret; } diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index fa3de3c3010c..787d47e667ad 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1600,6 +1600,19 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_interrupt = "AMDI0030:00@18", }, }, + { + /* + * Spurious wakeups from TP_ATTN# pin + * Found in BIOS 1.7.8 + * https://gitlab.freedesktop.org/drm/amd/-/issues/1722#note_1720627 + */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "NL5xNU"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_wake = "ELAN0415:00@9", + }, + }, { /* * Spurious wakeups from TP_ATTN# pin diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 71edb80fe0fb..06cd2f8c2773 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -1406,6 +1406,9 @@ int drm_fb_helper_check_var(struct fb_var_screeninfo *var, return -EINVAL; } + var->xres_virtual = fb->width; + var->yres_virtual = fb->height; + /* * Workaround for SDL 1.2, which is known to be setting all pixel format * fields values to zero in some cases. We treat this situation as a diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index ea8409e0e70e..12c4408bbc3b 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -5888,6 +5888,11 @@ static s32 brcmf_get_assoc_ies(struct brcmf_cfg80211_info *cfg, (struct brcmf_cfg80211_assoc_ielen_le *)cfg->extra_buf; req_len = le32_to_cpu(assoc_info->req_len); resp_len = le32_to_cpu(assoc_info->resp_len); + if (req_len > WL_EXTRA_BUF_MAX || resp_len > WL_EXTRA_BUF_MAX) { + bphy_err(drvr, "invalid lengths in assoc info: req %u resp %u\n", + req_len, resp_len); + return -EINVAL; + } if (req_len) { err = brcmf_fil_iovar_data_get(ifp, "assoc_req_ies", cfg->extra_buf, diff --git a/drivers/phy/broadcom/phy-brcm-usb.c b/drivers/phy/broadcom/phy-brcm-usb.c index 2bfd78e2d8fd..7a4c328bac58 100644 --- a/drivers/phy/broadcom/phy-brcm-usb.c +++ b/drivers/phy/broadcom/phy-brcm-usb.c @@ -445,9 +445,9 @@ static int brcm_usb_phy_dvr_init(struct platform_device *pdev, priv->suspend_clk = NULL; } - priv->wake_irq = platform_get_irq_byname(pdev, "wake"); + priv->wake_irq = platform_get_irq_byname_optional(pdev, "wake"); if (priv->wake_irq < 0) - priv->wake_irq = platform_get_irq_byname(pdev, "wakeup"); + priv->wake_irq = platform_get_irq_byname_optional(pdev, "wakeup"); if (priv->wake_irq >= 0) { err = devm_request_irq(dev, priv->wake_irq, brcm_usb_phy_wake_isr, 0, diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index fc12fee66141..f05aea57e2d8 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -595,6 +595,11 @@ static void option_instat_callback(struct urb *urb); #define SIERRA_VENDOR_ID 0x1199 #define SIERRA_PRODUCT_EM9191 0x90d3 +/* UNISOC (Spreadtrum) products */ +#define UNISOC_VENDOR_ID 0x1782 +/* TOZED LT70-C based on UNISOC SL8563 uses UNISOC's vendor ID */ +#define TOZED_PRODUCT_LT70C 0x4055 + /* Device flags */ /* Highest interface number which can be used with NCTRL() and RSVD() */ @@ -2225,6 +2230,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) 
}, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 937b60ae576e..35e889fe2a95 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1658,7 +1658,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) int left_ret; int right_ret; u64 left_gen; - u64 right_gen; + u64 right_gen = 0; struct btrfs_inode_info info; ret = get_inode_info(sctx->send_root, ino, &info); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 67b2aa552d22..099af8ba6fe5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2631,7 +2631,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct super_block *sb = fs_info->sb; struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_fs_devices *seed_devices; + struct btrfs_fs_devices *seed_devices = NULL; u64 orig_super_total_bytes; u64 orig_super_num_devices; int ret = 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f940395667c8..e132f70a059e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -784,70 +784,56 @@ static int vma_replace_policy(struct vm_area_struct *vma, return err; } -/* Step 2: apply policy to a range and do splits. */ -static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start, start); - struct vm_area_struct *prev; - struct vm_area_struct *vma; - int err = 0; + struct vm_area_struct *merged; + unsigned long vmstart, vmend; pgoff_t pgoff; + int err; - prev = mas_prev(&mas, 0); - if (unlikely(!prev)) - mas_set(&mas, start); + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } - vma = mas_find(&mas, end - 1); - if (WARN_ON(!vma)) + if (mpol_equal(vma_policy(vma), new_pol)) return 0; - if (start > vma->vm_start) - prev = vma; - - for (; vma; vma = mas_next(&mas, end - 1)) { - unsigned long vmstart = max(start, vma->vm_start); - unsigned long vmend = min(end, vma->vm_end); - - if (mpol_equal(vma_policy(vma), new_pol)) - goto next; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); - if (prev) { - /* vma_merge() invalidated the mas */ - mas_pause(&mas); - vma = prev; - goto replace; - } - if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); - if (err) - goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); - } - if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); - if (err) - goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); - } -replace: - err = vma_replace_policy(vma, new_pol); + pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + merged = vma_merge(vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, 
vma->vm_file, pgoff, new_pol, + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (merged) { + *prev = merged; + /* vma_merge() invalidated the mas */ + mas_pause(&vmi->mas); + return vma_replace_policy(merged, new_pol); + } + + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) - goto out; -next: - prev = vma; + return err; + /* split_vma() invalidated the mas */ + mas_pause(&vmi->mas); } -out: - return err; + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + return err; + /* split_vma() invalidated the mas */ + mas_pause(&vmi->mas); + } + + *prev = vma; + return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ @@ -1259,6 +1245,8 @@ static long do_mbind(unsigned long start, unsigned long len, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vma_iterator vmi; struct mempolicy *new; unsigned long end; int err; @@ -1328,7 +1316,13 @@ static long do_mbind(unsigned long start, unsigned long len, goto up_out; } - err = mbind_range(mm, start, end, new); + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } if (!err) { int nr_failed = 0; @@ -1489,10 +1483,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mempolicy *new; - unsigned long vmstart; - unsigned long vmend; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); @@ -1521,9 +1513,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); + prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); new = mpol_dup(vma_policy(vma)); if (IS_ERR(new)) { err = PTR_ERR(new); @@ -1547,7 +1538,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le } new->home_node = home_node; - err = mbind_range(mm, vmstart, vmend, new); + err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 06581223238c..f597fe0db9f8 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1003,7 +1003,14 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, if (hci_sock_gen_cookie(sk)) { struct sk_buff *skb; - if (capable(CAP_NET_ADMIN)) + /* Perform careful checks before setting the HCI_SOCK_TRUSTED + * flag. Make sure that not only the current task but also + * the socket opener has the required capability, since + * privileged programs can be tricked into making ioctl calls + * on HCI sockets, and the socket should not be marked as + * trusted simply because the ioctl caller is privileged. 
+ */ + if (sk_capable(sk, CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); /* Send event to monitor */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b1bbb0b75a13..ea46a5cb1c30 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2330,7 +2330,26 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push, dispose_it; + bool dispose_it, need_push = false; + + /* If the first subflow moved to a close state before accept, e.g. due + * to an incoming reset, mptcp either: + * - if either the subflow or the msk are dead, destroy the context + * (the subflow socket is deleted by inet_child_forget) and the msk + * - otherwise do nothing at the moment and take action at accept and/or + * listener shutdown - user-space must be able to accept() the closed + * socket. + */ + if (msk->in_accept_queue && msk->first == ssk) { + if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD)) + return; + + /* ensure later check in mptcp_worker() will dispose the msk */ + sock_set_flag(sk, SOCK_DEAD); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_drop_ctx(ssk); + goto out_release; + } dispose_it = !msk->subflow || ssk != msk->subflow->sk; if (dispose_it) @@ -2366,25 +2385,21 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); - } else if (msk->in_accept_queue && msk->first == ssk) { - /* if the first subflow moved to a close state, e.g. due to - * incoming reset and we reach here before inet_child_forget() - * the TCP stack could later try to close it via - * inet_csk_listen_stop(), or deliver it to the user space via - * accept(). - * We can't delete the subflow - or risk a double free - nor let - * the msk survive - or will be leaked in the non accept scenario: - * fallback and let TCP cope with the subflow cleanup. 
- */ - WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD)); - mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); + } + __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } + +out_release: release_sock(ssk); sock_put(ssk); @@ -2439,21 +2454,14 @@ static void __mptcp_close_subflow(struct sock *sk) mptcp_close_ssk(sk, ssk, subflow); } - /* if the MPC subflow has been closed before the msk is accepted, - * msk will never be accept-ed, close it now - */ - if (!msk->first && msk->in_accept_queue) { - sock_set_flag(sk, SOCK_DEAD); - inet_sk_state_store(sk, TCP_CLOSE); - } } -static bool mptcp_check_close_timeout(const struct sock *sk) +static bool mptcp_should_close(const struct sock *sk) { s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; struct mptcp_subflow_context *subflow; - if (delta >= TCP_TIMEWAIT_LEN) + if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue) return true; /* if all subflows are in closed status don't bother with additional @@ -2661,7 +2669,7 @@ static void mptcp_worker(struct work_struct *work) * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_check_close_timeout(sk)) { + if (mptcp_should_close(sk)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); } @@ -2906,6 +2914,14 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +void __mptcp_unaccepted_force_close(struct sock *sk) +{ + sock_set_flag(sk, SOCK_DEAD); + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + __mptcp_destroy_sock(sk); +} + static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { /* Concurrent splices from sk_receive_queue into receive_queue will @@ -3753,6 +3769,18 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + + /* Do late cleanup for the first subflow as necessary. Also + * deal with bad peers not doing a complete shutdown. 
+ */ + if (msk->first && + unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + __mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first), 0); + if (unlikely(list_empty(&msk->conn_list))) + inet_sk_state_store(newsk, TCP_CLOSE); + } + release_sock(newsk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 051e8022d661..441feeaeb242 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -615,10 +615,12 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void __mptcp_unaccepted_force_close(struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 125d1f58d6a4..67ddbf6f2e4e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -661,9 +661,12 @@ void mptcp_subflow_drop_ctx(struct sock *ssk) if (!ctx) return; - subflow_ulp_fallback(ssk, ctx); - if (ctx->conn) - sock_put(ctx->conn); + list_del(&mptcp_subflow_ctx(ssk)->node); + if (inet_csk(ssk)->icsk_ulp_ops) { + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + } kfree_rcu(ctx, rcu); } @@ -1758,6 +1761,77 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + struct sock *sk; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + sk = subflow->conn; + msk = mptcp_sk(sk); + if (msk->dl_next || msk == head) + continue; + + sock_hold(sk); + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + sk = (struct sock *)msk; + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + next = msk->dl_next; + msk->dl_next = NULL; + + __mptcp_unaccepted_force_close(sk); + release_sock(sk); + + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. 
+ */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); + mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); + + sock_put(sk); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk);
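
Illustration (not part of the patch above): the brcmfmac hunk in this release rejects firmware-reported association IE lengths that exceed the preallocated extra_buf before they are used, checking both against WL_EXTRA_BUF_MAX. Below is a minimal, self-contained userspace sketch of that same validate-before-use pattern; MAX_BUF, struct assoc_info, and check_assoc_lengths() are illustrative stand-ins, not the driver's actual definitions.

/*
 * Standalone sketch (not kernel code): bound device-reported lengths
 * against the buffer they describe before acting on them. MAX_BUF
 * stands in for WL_EXTRA_BUF_MAX; the struct layout is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_BUF 2048	/* size of the preallocated scratch buffer */

struct assoc_info {
	uint32_t req_len;	/* lengths as reported by the device */
	uint32_t resp_len;
};

/* Return 0 if both lengths fit in the shared buffer, -1 otherwise. */
static int check_assoc_lengths(const struct assoc_info *info)
{
	if (info->req_len > MAX_BUF || info->resp_len > MAX_BUF) {
		fprintf(stderr, "invalid lengths: req %u resp %u\n",
			(unsigned int)info->req_len,
			(unsigned int)info->resp_len);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct assoc_info ok  = { .req_len = 128,     .resp_len = 256 };
	struct assoc_info bad = { .req_len = 1u << 20, .resp_len = 0  };

	printf("ok:  %d\n", check_assoc_lengths(&ok));   /* prints 0  */
	printf("bad: %d\n", check_assoc_lengths(&bad));  /* prints -1 */
	return 0;
}

The point the fix encodes is that req_len and resp_len originate from the device, so they must be bounded by the size of the buffer they describe before any copy or parse of the IEs is attempted; trusting them unchecked allows out-of-bounds accesses on cfg->extra_buf.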