On 10/12/2014 17:52, Jeff Janes wrote: > On Tue, Dec 9, 2014 at 12:43 PM, Bruce Momjian <bruce@xxxxxxxxxx > <mailto:bruce@xxxxxxxxxx>> wrote: > > On Mon, Dec 8, 2014 at 03:40:43PM -0600, Merlin Moncure wrote: > > >> Did not see consistent measurable gains > 256 > > >> effective_io_concurrency. Interesting that at setting of '2' (the > > >> lowest possible setting with the feature actually working) is > > >> pessimal. > > > > > > Very interesting. When we added a per-tablespace random_page_cost, > > > there was a suggestion that we might want to add per-tablespace > > > effective_io_concurrency someday: > > > > What I'd really like to see is to have effective_io_concurrency work > > on other types of scans. It's clearly a barn burner on fast storage > > and perhaps the default should be something other than '1'. Spinning > > storage is clearly dead and ssd seem to really benefit from the posix > > readhead api. > > > I haven't played much with SSD, but effective_io_concurrency can be a > big win even on spinning disk. > > > > Well, the real question is knowing which blocks to request before > actually needing them. With a bitmap scan, that is easy --- I am > unclear how to do it for other scans. We already have kernel read-ahead > for sequential scans, and any index scan that hits multiple rows will > probably already be using a bitmap heap scan. > > > If the index scan is used to provide ordering as well as selectivity > then it will resist being converted to a bitmap scan. Also it won't > convert to a bitmap scan solely to get credit for the use of > effective_io_concurrency, as that setting doesn't enter into planning > decisions. > > For a regular index scan, it should be easy to prefetch table blocks for > all the tuples that will need to be retrieved based on the current index > leaf page, for example. Looking ahead across leaf page boundaries would > be harder. 
> I also think that having effective_io_concurrency for other nodes than bitmap scan would be really great, but for now having a per-tablespace effective_io_concurrency is simpler to implement and will already help, so here's a patch to implement it. I'm also adding it to the next commitfest. -- Julien Rouhaud http://dalibo.com - http://dalibo.org
diff --git a/doc/src/sgml/ref/create_tablespace.sgml b/doc/src/sgml/ref/create_tablespace.sgml index 5756c3e..cf08408 100644 --- a/doc/src/sgml/ref/create_tablespace.sgml +++ b/doc/src/sgml/ref/create_tablespace.sgml @@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable> <listitem> <para> A tablespace parameter to be set or reset. Currently, the only - available parameters are <varname>seq_page_cost</> and - <varname>random_page_cost</>. Setting either value for a particular - tablespace will override the planner's usual estimate of the cost of - reading pages from tables in that tablespace, as established by - the configuration parameters of the same name (see - <xref linkend="guc-seq-page-cost">, - <xref linkend="guc-random-page-cost">). This may be useful if one - tablespace is located on a disk which is faster or slower than the + available parameters are <varname>seq_page_cost</>, + <varname>random_page_cost</> and <varname>effective_io_concurrency</>. + Setting either value for a particular tablespace will override the + planner's usual estimate of the cost of reading pages from tables in + that tablespace, as established by the configuration parameters of the + same name (see <xref linkend="guc-seq-page-cost">, + <xref linkend="guc-random-page-cost">, + <xref linkend="guc-effective-io-concurrency">). This may be useful if + one tablespace is located on a disk which is faster or slower than the remainder of the I/O subsystem. 
</para> </listitem> diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 8176b6a..fb24d74 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -232,6 +232,18 @@ static relopt_int intRelOpts[] = }, -1, 64, MAX_KILOBYTES }, + { + { + "effective_io_concurrency", + "Number of simultaneous requests that can be handled efficiently by the disk subsystem.", + RELOPT_KIND_TABLESPACE + }, +#ifdef USE_PREFETCH + 1, 0, MAX_IO_CONCURRENCY +#else + 0, 0, 0 +#endif + }, /* list terminator */ {{NULL}} @@ -1387,7 +1399,8 @@ tablespace_reloptions(Datum reloptions, bool validate) int numoptions; static const relopt_parse_elt tab[] = { {"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)}, - {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)} + {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}, + {"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)} }; options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 4597437..7ea77c8 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -42,8 +42,10 @@ #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/predicate.h" +#include "utils/guc.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/spccache.h" #include "utils/snapmgr.h" #include "utils/tqual.h" @@ -111,7 +113,7 @@ BitmapHeapNext(BitmapHeapScanState *node) node->tbmres = tbmres = NULL; #ifdef USE_PREFETCH - if (target_prefetch_pages > 0) + if (node->target_prefetch_pages > 0) { node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm); node->prefetch_pages = 0; @@ -188,10 +190,10 @@ BitmapHeapNext(BitmapHeapScanState *node) * page/tuple, then to one after the 
second tuple is fetched, then * it doubles as later pages are fetched. */ - if (node->prefetch_target >= target_prefetch_pages) + if (node->prefetch_target >= node->target_prefetch_pages) /* don't increase any further */ ; - else if (node->prefetch_target >= target_prefetch_pages / 2) - node->prefetch_target = target_prefetch_pages; + else if (node->prefetch_target >= node->target_prefetch_pages / 2) + node->prefetch_target = node->target_prefetch_pages; else if (node->prefetch_target > 0) node->prefetch_target *= 2; else @@ -211,7 +213,7 @@ BitmapHeapNext(BitmapHeapScanState *node) * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple. */ - if (node->prefetch_target < target_prefetch_pages) + if (node->prefetch_target < node->target_prefetch_pages) node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -539,6 +541,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) { BitmapHeapScanState *scanstate; Relation currentRelation; +#ifdef USE_PREFETCH + int new_io_concurrency; +#endif /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -598,6 +603,25 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) */ currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); +#ifdef USE_PREFETCH + /* check if the effective_io_concurrency has been overloaded for the + * tablespace storing the relation and compute the target_prefetch_pages, + * or just get the current target_prefetch_pages + */ + new_io_concurrency = get_tablespace_io_concurrency( + currentRelation->rd_rel->reltablespace); + + + scanstate->target_prefetch_pages = target_prefetch_pages; + + if (new_io_concurrency != effective_io_concurrency) + { + double prefetch_pages; + if (compute_io_concurrency(new_io_concurrency, &prefetch_pages)) + scanstate->target_prefetch_pages = rint(prefetch_pages); + } +#endif + 
scanstate->ss.ss_currentRelation = currentRelation; /* @@ -634,3 +658,58 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) */ return scanstate; } + +bool +compute_io_concurrency(int io_concurrency, double *target_prefetch_pages) +{ + double new_prefetch_pages = 0.0; + int i; + + /* make sure the io_concurrency value is correct, it may have been forced + * with a pg_tablespace UPDATE + */ + if (io_concurrency > MAX_IO_CONCURRENCY) + io_concurrency = MAX_IO_CONCURRENCY; + + /*---------- + * The user-visible GUC parameter is the number of drives (spindles), + * which we need to translate to a number-of-pages-to-prefetch target. + * The target value is stashed in *extra and then assigned to the actual + * variable by assign_effective_io_concurrency. + * + * The expected number of prefetch pages needed to keep N drives busy is: + * + * drives | I/O requests + * -------+---------------- + * 1 | 1 + * 2 | 2/1 + 2/2 = 3 + * 3 | 3/1 + 3/2 + 3/3 = 5 1/2 + * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3 + * n | n * H(n) + * + * This is called the "coupon collector problem" and H(n) is called the + * harmonic series. This could be approximated by n * ln(n), but for + * reasonable numbers of drives we might as well just compute the series. + * + * Alternatively we could set the target to the number of pages necessary + * so that the expected number of active spindles is some arbitrary + * percentage of the total. This sounds the same but is actually slightly + * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is + * that desired fraction. + * + * Experimental results show that both of these formulas aren't aggressive + * enough, but we don't really have any better proposals. + * + * Note that if io_concurrency = 0 (disabled), we must set target = 0. 
+ *---------- + */ + + + for (i = 1; i <= io_concurrency; i++) + new_prefetch_pages += (double) io_concurrency / (double) i; + + *target_prefetch_pages = new_prefetch_pages; + + /* This range check shouldn't fail, but let's be paranoid */ + return (new_prefetch_pages > 0.0 && new_prefetch_pages < (double) INT_MAX); +} diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c index 1a0c884..970d66b 100644 --- a/src/backend/utils/cache/spccache.c +++ b/src/backend/utils/cache/spccache.c @@ -23,7 +23,9 @@ #include "commands/tablespace.h" #include "miscadmin.h" #include "optimizer/cost.h" +#include "storage/bufmgr.h" #include "utils/catcache.h" +#include "utils/guc.h" #include "utils/hsearch.h" #include "utils/inval.h" #include "utils/spccache.h" @@ -198,3 +200,16 @@ get_tablespace_page_costs(Oid spcid, *spc_seq_page_cost = spc->opts->seq_page_cost; } } + +int +get_tablespace_io_concurrency(Oid spcid) +{ + TableSpaceCacheEntry *spc = get_tablespace(spcid); + + Assert(spc != NULL); + + if (!spc->opts || spc->opts->effective_io_concurrency < 0) + return effective_io_concurrency; + else + return spc->opts->effective_io_concurrency; +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 1bed525..6d7c0ae 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -37,6 +37,7 @@ #include "commands/vacuum.h" #include "commands/variable.h" #include "commands/trigger.h" +#include "executor/nodeBitmapHeapscan.h" #include "funcapi.h" #include "libpq/auth.h" #include "libpq/be-fsstubs.h" @@ -438,6 +439,8 @@ int temp_file_limit = -1; int num_temp_buffers = 1024; +int effective_io_concurrency = 0; + char *cluster_name = ""; char *ConfigFileName; char *HbaFileName; @@ -490,7 +493,6 @@ static int wal_block_size; static bool data_checksums; static int wal_segment_size; static bool integer_datetimes; -static int effective_io_concurrency; static bool assert_enabled; /* should be static, but commands/variable.c 
needs to get at this */ @@ -2352,7 +2354,7 @@ static struct config_int ConfigureNamesInt[] = }, &effective_io_concurrency, #ifdef USE_PREFETCH - 1, 0, 1000, + 1, 0, MAX_IO_CONCURRENCY, #else 0, 0, 0, #endif @@ -9997,47 +9999,9 @@ static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source) { #ifdef USE_PREFETCH - double new_prefetch_pages = 0.0; - int i; - - /*---------- - * The user-visible GUC parameter is the number of drives (spindles), - * which we need to translate to a number-of-pages-to-prefetch target. - * The target value is stashed in *extra and then assigned to the actual - * variable by assign_effective_io_concurrency. - * - * The expected number of prefetch pages needed to keep N drives busy is: - * - * drives | I/O requests - * -------+---------------- - * 1 | 1 - * 2 | 2/1 + 2/2 = 3 - * 3 | 3/1 + 3/2 + 3/3 = 5 1/2 - * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3 - * n | n * H(n) - * - * This is called the "coupon collector problem" and H(n) is called the - * harmonic series. This could be approximated by n * ln(n), but for - * reasonable numbers of drives we might as well just compute the series. - * - * Alternatively we could set the target to the number of pages necessary - * so that the expected number of active spindles is some arbitrary - * percentage of the total. This sounds the same but is actually slightly - * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is - * that desired fraction. - * - * Experimental results show that both of these formulas aren't aggressive - * enough, but we don't really have any better proposals. - * - * Note that if *newval = 0 (disabled), we must set target = 0. 
- *---------- - */ - - for (i = 1; i <= *newval; i++) - new_prefetch_pages += (double) *newval / (double) i; + double new_prefetch_pages; - /* This range check shouldn't fail, but let's be paranoid */ - if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX) + if (compute_io_concurrency(*newval, &new_prefetch_pages)) { int *myextra = (int *) guc_malloc(ERROR, sizeof(int)); diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 0683548..36b8a75 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -1870,7 +1870,7 @@ psql_completion(const char *text, int start, int end) pg_strcasecmp(prev_wd, "(") == 0) { static const char *const list_TABLESPACEOPTIONS[] = - {"seq_page_cost", "random_page_cost", NULL}; + {"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL}; COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS); } diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index 6b928a5..be9582a 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -39,6 +39,7 @@ typedef struct TableSpaceOpts int32 vl_len_; /* varlena header (do not touch directly!) 
*/ float8 random_page_cost; float8 seq_page_cost; + int effective_io_concurrency; } TableSpaceOpts; extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index 3183376..698fcf5 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -20,5 +20,6 @@ extern BitmapHeapScanState *ExecInitBitmapHeapScan(BitmapHeapScan *node, EState extern TupleTableSlot *ExecBitmapHeapScan(BitmapHeapScanState *node); extern void ExecEndBitmapHeapScan(BitmapHeapScanState *node); extern void ExecReScanBitmapHeapScan(BitmapHeapScanState *node); +extern bool compute_io_concurrency(int io_concurrency, double *target_prefetch_pages); #endif /* NODEBITMAPHEAPSCAN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 541ee18..c6d48fa 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1410,15 +1410,16 @@ typedef struct BitmapIndexScanState /* ---------------- * BitmapHeapScanState information * - * bitmapqualorig execution state for bitmapqualorig expressions - * tbm bitmap obtained from child index scan(s) - * tbmiterator iterator for scanning current pages - * tbmres current-page data - * exact_pages total number of exact pages retrieved - * lossy_pages total number of lossy pages retrieved - * prefetch_iterator iterator for prefetching ahead of current page - * prefetch_pages # pages prefetch iterator is ahead of current - * prefetch_target target prefetch distance + * bitmapqualorig execution state for bitmapqualorig expressions + * tbm bitmap obtained from child index scan(s) + * tbmiterator iterator for scanning current pages + * tbmres current-page data + * exact_pages total number of exact pages retrieved + * lossy_pages total number of lossy pages retrieved + * prefetch_iterator iterator for prefetching ahead of current page + * prefetch_pages # pages prefetch iterator is 
ahead of current + * prefetch_target target prefetch distance + * target_prefetch_pages may be overloaded by tablespace setting * ---------------- */ typedef struct BitmapHeapScanState @@ -1433,6 +1434,9 @@ typedef struct BitmapHeapScanState TBMIterator *prefetch_iterator; int prefetch_pages; int prefetch_target; +#ifdef USE_PREFETCH + int target_prefetch_pages; +#endif } BitmapHeapScanState; /* ---------------- diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index dc167f9..57008fc 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -26,6 +26,9 @@ #define MAX_KILOBYTES (INT_MAX / 1024) #endif +/* upper limit for effective_io_concurrency */ +#define MAX_IO_CONCURRENCY 1000 + /* * Automatic configuration file name for ALTER SYSTEM. * This file will be used to store values of configuration parameters @@ -256,6 +259,8 @@ extern int temp_file_limit; extern int num_temp_buffers; +extern int effective_io_concurrency; + extern char *cluster_name; extern char *ConfigFileName; extern char *HbaFileName; diff --git a/src/include/utils/spccache.h b/src/include/utils/spccache.h index bdd1c0f..e5b9769 100644 --- a/src/include/utils/spccache.h +++ b/src/include/utils/spccache.h @@ -15,5 +15,6 @@ void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost, float8 *spc_seq_page_cost); +int get_tablespace_io_concurrency(Oid spcid); #endif /* SPCCACHE_H */
-- Sent via pgsql-performance mailing list (pgsql-performance@xxxxxxxxxxxxxx) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-performance