Hi, While running openldap unit tests on a ppc64 system, we have had issues with the cp command. cp uses the FS_IOC_FIEMAP ioctl to optimize the copy and it appeared that the ext4 extent list of the file did not match all the data which was 'written' on disk. The system we use has a 64kB page size but the page size being greater than the filesystem block seems to be the top level reason of the problem. One can use a 1kB block size filesystem to reproduce the issue on a 4kB page size system. Attached is a simple test case from Anton, which creates extents as follow : lseek(48K -1) -> creates [11/1) p = mmap(128K) *(p) = 1 -> creates [0/1) with a fault lseek(128K) -> creates [31/1) *(p + 49K) = 1 -> creates [12/1) and then merges in [11/2) munmap(128K) On a 4kB page size system, the extent list returned by FS_IOC_FIEMAP looks correct : Extent 0: logical: 0 physical: 0 length: 4096 flags 0x006 Extent 1: logical: 45056 physical: 0 length: 8192 flags 0x006 Extent 2: logical: 126976 physical: 0 length: 4096 flags 0x007 But, with a 64kB page size, we miss the in-the-middle extent (no page fault but the data is on disk) : Extent 0: logical: 0 physical: 0 length: 49152 flags 0x006 Extent 1: logical: 126976 physical: 0 length: 4096 flags 0x007 This looks wrong. Right ? Or are we doing something wrong ? I have been digging in the ext4 page writeback code. There are some caveats when blocksize < pagesize but I am not sure my understanding is correct. Many thanks, C.
/* * mmap vs extent issue * * Copyright (C) 2014 Anton Blanchard <anton@xxxxxxxxxx>, IBM * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include <stdio.h> #include <string.h> #include <stdlib.h> #include <unistd.h> #include <sys/types.h> #include <sys/mman.h> #include <sys/ioctl.h> #include <linux/fs.h> #include <linux/fiemap.h> static void check_fiemap(int fd) { struct fiemap *fiemap; unsigned long i, ex_size; fiemap = malloc(sizeof(struct fiemap)); if (!fiemap) { perror("malloc"); exit(1); } memset(fiemap, 0, sizeof(struct fiemap)); fiemap->fm_length = ~0; if (ioctl(fd, FS_IOC_FIEMAP, fiemap) == -1) { perror("ioctl(FIEMAP)"); exit(1); } ex_size = sizeof(struct fiemap_extent) * fiemap->fm_mapped_extents; fiemap = realloc(fiemap, sizeof(struct fiemap) + ex_size); if (!fiemap) { perror("realloc"); exit(1); } memset(fiemap->fm_extents, 0, ex_size); fiemap->fm_extent_count = fiemap->fm_mapped_extents; fiemap->fm_mapped_extents = 0; if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { perror("ioctl(FIEMAP)"); exit(1); } for (i = 0; i < fiemap->fm_mapped_extents; i++) { unsigned long start = fiemap->fm_extents[i].fe_logical; unsigned long end = fiemap->fm_extents[i].fe_logical + fiemap->fm_extents[i].fe_length; if (start <= 48*1024 && end > 48*1024) { printf("GOOD\n"); exit(0); } } printf("BAD:\n"); for (i = 0; i < fiemap->fm_mapped_extents; i++) { printf("%ld:\t%016llx %016llx\n", i, fiemap->fm_extents[i].fe_logical, fiemap->fm_extents[i].fe_length); } exit(1); } int main(int argc, char *argv[]) { char name[] = "mmap-lseek-XXXXXX"; int fd; char *p; fd = mkstemp(name); if (fd == -1) { perror("mkstemp"); exit(1); } /* Create a 48 kB file */ lseek(fd, 48 * 1024 - 1, SEEK_SET); if (write(fd, "\0", 1) != 1) { perror("write"); exit(1); } /* Map it, allowing space for it to grow */ p = mmap(NULL, 128 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (p == MAP_FAILED) { perror("mmap"); exit(1); } /* Write to the start of the file */ *(p) = 1; /* Extend the file */ lseek(fd, 128 * 1024 - 1, SEEK_SET); if (write(fd, "\0", 1) != 1) { perror("write"); exit(1); } /* write to the new space in the first page */ *(p + 49 * 1024) = 1; munmap(p, 128 * 1024); check_fiemap(fd); close(fd); return 0; }