Prevent possible PTE corruption while calling mmap on /dev/mem with large offset. oops info, please note the PTE value 8008000000000225. ---------------------------------8<-------------------------------------- [85739.124496] rep: Corrupted page table at address 7f63852f8000 [85739.130242] PGD ba2eb067 PUD b99c1067 PMD a2fa5067 PTE 8008000000000225 [85739.136941] Bad pagetable: 000d [#1] SMP [85739.141002] Modules linked in: cfg80211 rfkill x86_pkg_temp_thermal coretemp kvm_intel kvm bnx2 crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel microcode iTCO_wdt ipmi_si i2c_i801 iTCO_vendor_support ipmi_msghandler dcdbas shpchp lpc_ich mfd_core nfsd auth_rpcgss nfs_acl lockd sunrpc mgag200 i2c_algo_bit drm_kms_helper ttm drm i2c_core [85739.172620] CPU: 3 PID: 21900 Comm: rep Not tainted 3.15.8-200.fc20.x86_64 #1 [85739.179768] Hardware name: Dell Inc. PowerEdge R210 II/09T7VV, BIOS 2.0.4 02/29/2012 [85739.187512] task: ffff8800b9b3b160 ti: ffff8800ba270000 task.ti: ffff8800ba270000 [85739.194988] RIP: 0033:[<0000000000400773>] [<0000000000400773>] 0x400773 [85739.201799] RSP: 002b:00007fffe4ca3c80 EFLAGS: 00010213 [85739.207119] RAX: 00007f63852f8000 RBX: 0000000000000000 RCX: 00007f6384e0b8ca [85739.214249] RDX: 0000000000000001 RSI: 0000000000001000 RDI: 0000000000000000 [85739.221407] RBP: 00007fffe4ca3cc0 R08: 0000000000000003 R09: 0008000000000000 [85739.228545] R10: 0000000000000001 R11: 0000000000000206 R12: 00000000004005b0 [85739.235676] R13: 00007fffe4ca3da0 R14: 0000000000000000 R15: 0000000000000000 [85739.242835] FS: 00007f63852ea740(0000) GS:ffff88013fcc0000(0000) knlGS:0000000000000000 [85739.250925] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [85739.256669] CR2: 00007f63852f8000 CR3: 00000000b9ba0000 CR4: 00000000001407e0 ---------------------------------8<-------------------------------------- According to [1] Chapter 4 Paging, some higher bits in 64bit PTE(X86_64 || X86_32_PAE) are reserved and have to be set to zero. 
For example, for IA-32e and 4KB page [1] 4.5 IA-32e Paging: Table 4-19, bits 51-M(MAXPHYADDR) are reserved. So for a CPU with e.g. 48bit phys addr width, bits 51-48 have to be zero. If one of the reserved bits is set, [1] 4.7 Page-Fault Exceptions, the #PF is generated with RSVD error code. <quote> RSVD flag (bit 3). This flag is 1 if there is no valid translation for the linear address because a reserved bit was set in one of the paging-structure entries used to translate that address. (Because reserved bits are not checked in a paging-structure entry whose P flag is 0, bit 3 of the error code can be set only if bit 0 is also set.) </quote> In mmap_mem() the first check is valid_mmap_phys_addr_range(), but it always returns 1 for x86. So it's possible to use any pgoff we want and to set the PTE's reserved bits in remap_pfn_range(). Meaning there is a possibility to use mmap on /dev/mem and cause a system panic. It's probably not that serious, because access to /dev/mem is limited and the system has to have the panic_on_oops set, but still I think we should check this and return an error. The path for this problem is: mmap_mem() => remap_pfn_range() => page present => touch page => tlb miss => walk through paging structures => reserved bit set => #pf with rsvd flag. This patch adds a check for x86. With this fix mmap returns -EINVAL if the requested phys addr is larger than the supported phys addr width. [1] Intel 64 and IA-32 Architectures Software Developer's Manual, Volume 3A x86_64 reproducer ---------------------------------8<-------------------------------------- #include <stdio.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <err.h> #include <stdlib.h> #include <sys/mman.h> #define die(fmt, ...) 
err(1, fmt, ##__VA_ARGS__) #define OFFSET 0x8000000000000LL int main(int argc, char *argv[]) { int fd; long ps; long pgoff; char *map; char c; ps = sysconf(_SC_PAGE_SIZE); if (ps == -1) die("cannot get page size"); fd = open("/dev/mem", O_RDONLY); if (fd == -1) die("cannot open /dev/mem"); pgoff = (OFFSET + (ps - 1)) & ~(ps - 1); map = mmap(NULL, ps, PROT_READ, MAP_SHARED, fd, pgoff); if (map == MAP_FAILED) die("cannot mmap"); c = map[0]; if (munmap(map, ps) == -1) die("cannot munmap"); if (close(fd) == -1) die("cannot close"); return 0; } ---------------------------------8<-------------------------------------- x86_32_PAE reproducer ---------------------------------8<-------------------------------------- #define _GNU_SOURCE #define _LARGEFILE64_SOURCE #include <unistd.h> #include <sys/syscall.h> #include <stdio.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <err.h> #include <stdlib.h> #include <sys/mman.h> #define die(fmt, ...) err(1, fmt, ##__VA_ARGS__) /* 37th bit in PTE */ #define OFFSET 0x2000000 int main(int argc, char *argv[]) { int fd; long ps; char *map; char c; ps = sysconf(_SC_PAGE_SIZE); if (ps == -1) die("cannot get page size"); fd = open("/dev/mem", O_RDONLY|O_LARGEFILE); if (fd == -1) die("cannot open /dev/mem"); map = (char *)syscall(SYS_mmap2, NULL, ps, PROT_READ, MAP_SHARED, fd, OFFSET); if (map == MAP_FAILED) die("cannot mmap"); c = map[0]; if (munmap(map, ps) == -1) die("cannot munmap"); if (close(fd) == -1) die("cannot close"); return 0; } ---------------------------------8<-------------------------------------- V3: use len_bytes instead of count, thanks to Dave Hansen and Thomas Gleixner V2: fix pfn check in valid_mmap_phys_addr_range, thanks to Dave Hansen Signed-off-by: Frantisek Hrbata <fhrbata@xxxxxxxxxx> --- arch/x86/include/asm/io.h | 4 ++++ arch/x86/mm/mmap.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 
b8237d8..49ede3c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -243,6 +243,10 @@ static inline void flush_write_buffers(void) #endif } +#define ARCH_HAS_VALID_PHYS_ADDR_RANGE +extern int valid_phys_addr_range(phys_addr_t addr, size_t count); +extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t len_bytes); + #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 919b912..77a13f8 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -31,6 +31,8 @@ #include <linux/sched.h> #include <asm/elf.h> +#include "physaddr.h" + struct va_alignment __read_mostly va_align = { .flags = -1, }; @@ -122,3 +124,13 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +int valid_phys_addr_range(phys_addr_t addr, size_t count) +{ + return addr + count <= __pa(high_memory); +} + +int valid_mmap_phys_addr_range(unsigned long pfn, size_t len_bytes) +{ + return arch_pfn_possible(pfn + (len_bytes >> PAGE_SHIFT)); +} -- 1.9.3 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>