On Sunday 07 February 2010 19:27:02 Andres Freund wrote: > On Sunday 07 February 2010 19:23:10 Robert Haas wrote: > > On Sun, Feb 7, 2010 at 11:24 AM, Tom Lane <tgl@xxxxxxxxxxxxx> wrote: > > > Greg Smith <greg@xxxxxxxxxxxxxxx> writes: > > >> This is turning into yet another one of those situations where > > >> something simple and useful is being killed by trying to generalize > > >> it way more than it needs to be, given its current goals and its lack > > >> of external interfaces. There's no catversion bump or API breakage > > >> to hinder future refactoring if this isn't optimally designed > > >> internally from day one. > > > > > > I agree that it's too late in the cycle for any major redesign of the > > > patch. But is it too much to ask to use a less confusing name for the > > > function? > > > > +1. Let's just rename the thing, add some comments, and call it good. > > Will post a updated patch in the next hours unless somebody beats me too > it. Here we go. I left the name at my suggestion pg_fsync_prepare instead of Tom's prepare_for_fsync because it seemed more consistend with the naming in the rest of the file. Obviously feel free to adjust. I personally think the fsync on the directory should be added to the stable branches - other opinions? If wanted I can prepare patches for that. Andres
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 7ffa2eb..bc5753a 100644 *** a/src/backend/storage/file/fd.c --- b/src/backend/storage/file/fd.c *************** pg_fdatasync(int fd) *** 320,325 **** --- 320,361 ---- } /* + * pg_fsync_prepare --- try to make a later fsync on the same file faster + * + * A call to this function does not guarantee anything! + * + * The idea is to tell the kernel to write out its cache so that a + * fsync later on has less to write out synchronously. This allows + * that write requests get reordered more freely. + * + * In the current implementation this has the additional effect of + * dropping the cache in that region and thus can be used to avoid + * cache poisoning. This may or may not be wanted. + * + * XXX: Ideally this API would use sync_file_range (or similar on + * platforms other than linux) and a seperate one for cache + * control. We are not there yet. + * + * Look at the thread below 200912282354.51892.andres@xxxxxxxxxxx in + * pgsql-hackers for a longer discussion. + */ + int + pg_fsync_prepare(int fd, off_t offset, off_t amount) + { + if (enableFsync) + { + #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); + #else + return 0; + #endif + } + else + return 0; + } + + + /* * InitFileAccess --- initialize this module during backend startup * * This is called during either normal or standalone backend start. diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 21cb024..b1a4b49 100644 *** a/src/include/storage/fd.h --- b/src/include/storage/fd.h *************** extern int pg_fsync_no_writethrough(int *** 99,104 **** --- 99,106 ---- extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); + extern int prepare_for_fsync(int fd, off_t offset, off_t amount); + /* Filename components for OpenTemporaryFile */ #define PG_TEMP_FILES_DIR "pgsql_tmp" #define PG_TEMP_FILE_PREFIX "pgsql_tmp" diff --git a/src/port/copydir.c b/src/port/copydir.c index 72fbf36..eef3cfb 100644 *** a/src/port/copydir.c --- b/src/port/copydir.c *************** *** 37,42 **** --- 37,43 ---- static void copy_file(char *fromfile, char *tofile); + static void fsync_fname(char *fname); /* *************** copydir(char *fromdir, char *todir, bool *** 64,69 **** --- 65,73 ---- (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", fromdir))); + /* + * Copy all the files + */ while ((xlde = ReadDir(xldir, fromdir)) != NULL) { struct stat fst; *************** copydir(char *fromdir, char *todir, bool *** 89,96 **** --- 93,127 ---- else if (S_ISREG(fst.st_mode)) copy_file(fromfile, tofile); } + FreeDir(xldir); + + /* + * Be paranoid here and fsync all files to ensure we catch problems. + */ + xldir = AllocateDir(fromdir); + if (xldir == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", fromdir))); + + while ((xlde = ReadDir(xldir, fromdir)) != NULL) + { + if (strcmp(xlde->d_name, ".") == 0 || + strcmp(xlde->d_name, "..") == 0) + continue; + snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name); + fsync_fname(tofile); + } FreeDir(xldir); + + /* It's important to fsync the destination directory itself as + * individual file fsyncs don't guarantee that the directory entry + * for the file is synced. Recent versions of ext4 have made the + * window much wider but it's been true for ext3 and other + * filesyetems in the past + */ + fsync_fname(todir); } /* *************** copy_file(char *fromfile, char *tofile) *** 103,108 **** --- 134,140 ---- int srcfd; int dstfd; int nbytes; + off_t offset; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) *************** copy_file(char *fromfile, char *tofile) *** 128,134 **** /* * Do the data copying. */ ! for (;;) { nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) --- 160,166 ---- /* * Do the data copying. */ ! for (offset=0; ; offset+=nbytes) { nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) *************** copy_file(char *fromfile, char *tofile) *** 147,161 **** (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } - } ! /* ! * Be paranoid here to ensure we catch problems. ! */ ! if (pg_fsync(dstfd) != 0) ! ereport(ERROR, ! (errcode_for_file_access(), ! errmsg("could not fsync file \"%s\": %m", tofile))); if (close(dstfd)) ereport(ERROR, --- 179,197 ---- (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } ! /* ! * Start flushing the dirty memory early. ! * ! * Empirical results seem to show a big speed benefit on ! * recent Linux kernels. ! * As a side effect this avoids cache pollution by throwing ! * away the cache of the new file. ! * XXX: It might be more sensible to do that on the origin ! * than the source. ! */ ! pg_fsync_prepare(dstfd, offset, nbytes); ! } if (close(dstfd)) ereport(ERROR, *************** copy_file(char *fromfile, char *tofile) *** 166,168 **** --- 202,226 ---- pfree(buffer); } + + /* + * fsync a file + */ + static void + fsync_fname(char *fname) + { + int fd = BasicOpenFile(fname, + O_RDONLY | PG_BINARY, + S_IRUSR | S_IWUSR); + + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", fname))); + close(fd); + }
-- Sent via pgsql-performance mailing list (pgsql-performance@xxxxxxxxxxxxxx) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-performance