On Mon, Jan 18, 2010 at 4:35 PM, Greg Stark <gsstark@xxxxxxx> wrote:
> Looking at this patch for the commitfest I have a few questions.

So I've touched this patch up a bit:

1) Moved the posix_fadvise call to a new fd.c function
pg_fsync_start(fd, offset, nbytes), which initiates an fsync without
waiting on it. Currently it's only implemented with
posix_fadvise(DONTNEED), but I want to look into using sync_file_range
in the future -- it looks like that call might be good enough for our
checkpoints (a rough, illustrative sketch of that follows below).

2) Advised each 64k chunk as we write it, which should avoid poisoning
the cache if you do a large CREATE DATABASE on an active system.

3) Added the promised but, afaict, missing fsync of the directory -- I
think we should actually backpatch this.

Barring any objections, shall I commit it like this?

--
greg
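For anyone curious what the sync_file_range() variant mentioned in point 1
might look like, here is a rough, untested sketch. It is not part of the
patch below; the name pg_fsync_start_sketch, the __linux__ guard, and the
bare POSIX_FADV_DONTNEED fallback are placeholders for illustration only.

/*
 * Illustrative sketch only, not part of the attached patch.  On Linux,
 * sync_file_range(2) can start writeback of a byte range without waiting
 * for it to complete, which is closer to "initiate an fsync" than
 * posix_fadvise(DONTNEED).  Requires _GNU_SOURCE and a 2.6.17+ kernel.
 */
#define _GNU_SOURCE
#include <fcntl.h>

static int
pg_fsync_start_sketch(int fd, off_t offset, off_t amount)
{
#if defined(__linux__)
	/* Queue the dirty pages for writeback but don't wait for them. */
	return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
#elif defined(POSIX_FADV_DONTNEED)
	/* Fallback matching what the patch below actually does today. */
	return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#else
	return 0;
#endif
}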
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 319,324 **** pg_fdatasync(int fd)
--- 319,340 ----
  	return 0;
  }
  
+ int
+ pg_fsync_start(int fd, off_t offset, off_t amount)
+ {
+ 	if (enableFsync)
+ 	{
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ 		return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
+ #else
+ 		return 0;
+ #endif
+ 	}
+ 	else
+ 		return 0;
+ }
+ 
+ 
  /*
   * InitFileAccess --- initialize this module during backend startup
   *
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 98,103 **** extern int pg_fsync(int fd);
--- 98,104 ----
  extern int pg_fsync_no_writethrough(int fd);
  extern int pg_fsync_writethrough(int fd);
  extern int pg_fdatasync(int fd);
+ extern int pg_fsync_start(int fd, off_t offset, off_t amount);
  
  /* Filename components for OpenTemporaryFile */
  #define PG_TEMP_FILES_DIR "pgsql_tmp"
*** a/src/port/copydir.c
--- b/src/port/copydir.c
***************
*** 37,42 ****
--- 37,43 ----
  
  
  static void copy_file(char *fromfile, char *tofile);
+ static void fsync_fname(char *fname);
  
  
  /*
***************
*** 64,69 **** copydir(char *fromdir, char *todir, bool recurse)
--- 65,73 ----
  				(errcode_for_file_access(),
  				 errmsg("could not open directory \"%s\": %m", fromdir)));
  
+ 	/*
+ 	 * Copy all the files
+ 	 */
  	while ((xlde = ReadDir(xldir, fromdir)) != NULL)
  	{
  		struct stat fst;
***************
*** 89,96 **** copydir(char *fromdir, char *todir, bool recurse)
--- 93,127 ----
  		else if (S_ISREG(fst.st_mode))
  			copy_file(fromfile, tofile);
  	}
+ 	FreeDir(xldir);
+ 
+ 	/*
+ 	 * Be paranoid here and fsync all files to ensure we catch problems.
+ 	 */
+ 	xldir = AllocateDir(fromdir);
+ 	if (xldir == NULL)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not open directory \"%s\": %m", fromdir)));
+ 
+ 	while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+ 	{
+ 		if (strcmp(xlde->d_name, ".") == 0 ||
+ 			strcmp(xlde->d_name, "..") == 0)
+ 			continue;
+ 		snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
+ 		fsync_fname(tofile);
+ 	}
  
  	FreeDir(xldir);
+ 
+ 	/* It's important to fsync the destination directory itself as
+ 	 * individual file fsyncs don't guarantee that the directory entry
+ 	 * for the file is synced.  Recent versions of ext4 have made the
+ 	 * window much wider but it's been true for ext3 and other
+ 	 * filesystems in the past.
+ 	 */
+ 	fsync_fname(todir);
  }
  
  /*
***************
*** 103,108 **** copy_file(char *fromfile, char *tofile)
--- 134,140 ----
  	int			srcfd;
  	int			dstfd;
  	int			nbytes;
+ 	off_t		offset;
  
  	/* Use palloc to ensure we get a maxaligned buffer */
  #define COPY_BUF_SIZE (8 * BLCKSZ)
***************
*** 128,134 **** copy_file(char *fromfile, char *tofile)
  	/*
  	 * Do the data copying.
  	 */
! 	for (;;)
  	{
  		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
  		if (nbytes < 0)
--- 160,166 ----
  	/*
  	 * Do the data copying.
  	 */
! 	for (offset = 0; ; offset += nbytes)
  	{
  		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
  		if (nbytes < 0)
***************
*** 147,161 **** copy_file(char *fromfile, char *tofile)
  					(errcode_for_file_access(),
  					 errmsg("could not write to file \"%s\": %m", tofile)));
  		}
- 	}
  
! 	/*
! 	 * Be paranoid here to ensure we catch problems.
! 	 */
! 	if (pg_fsync(dstfd) != 0)
! 		ereport(ERROR,
! 				(errcode_for_file_access(),
! 				 errmsg("could not fsync file \"%s\": %m", tofile)));
  
  	if (close(dstfd))
  		ereport(ERROR,
--- 179,195 ----
  					(errcode_for_file_access(),
  					 errmsg("could not write to file \"%s\": %m", tofile)));
  		}
  
! 		/*
! 		 * Avoid poisoning the cache with the entire database just
! 		 * like we do for large sequential scans.  Actually that's a
! 		 * white lie, the real motivation for this is that it might
! 		 * urge the kernel to flush these buffers when convenient
! 		 * before we get to the later fsyncs.  Empirical results seem
! 		 * to show a big speed benefit on recent Linux kernels.
! 		 */
! 		pg_fsync_start(dstfd, offset, nbytes);
! 	}
  
  	if (close(dstfd))
  		ereport(ERROR,
***************
*** 166,168 **** copy_file(char *fromfile, char *tofile)
--- 200,224 ----
  
  	pfree(buffer);
  }
+ 
+ /*
+  * fsync a file
+  */
+ static void
+ fsync_fname(char *fname)
+ {
+ 	int			fd = BasicOpenFile(fname,
+ 							O_RDONLY | PG_BINARY,
+ 							S_IRUSR | S_IWUSR);
+ 
+ 	if (fd < 0)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not open file \"%s\": %m", fname)));
+ 
+ 	if (pg_fsync(fd) != 0)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not fsync file \"%s\": %m", fname)));
+ 	close(fd);
+ }
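As a side note on point 3, the reasoning behind fsync_fname(todir) can be
shown with a tiny standalone program. This is only an illustration -- the
paths and the helper name create_durable_file are made up, and none of it
comes from the patch above.

/*
 * Illustration of why the destination directory itself gets fsynced:
 * fsyncing a newly created file does not guarantee that its directory
 * entry is on disk, so the parent directory must be fsynced as well.
 * Hypothetical example; the paths are placeholders.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

static void
create_durable_file(const char *dir, const char *path)
{
	int		fd;

	/* Create, write and fsync the file itself... */
	fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);
	if (fd < 0 || write(fd, "data\n", 5) != 5 || fsync(fd) != 0)
	{
		perror(path);
		exit(1);
	}
	close(fd);

	/* ...then fsync the containing directory so the new entry survives
	 * a crash too.  This is the step copydir() had been missing. */
	fd = open(dir, O_RDONLY);
	if (fd < 0 || fsync(fd) != 0)
	{
		perror(dir);
		exit(1);
	}
	close(fd);
}

int
main(void)
{
	mkdir("/tmp/copydir_demo", 0700);	/* ignore EEXIST for the demo */
	create_durable_file("/tmp/copydir_demo", "/tmp/copydir_demo/newfile");
	return 0;
}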