Re: intel s3500 -- hot stuff

On 10/12/2014 17:52, Jeff Janes wrote:
> On Tue, Dec 9, 2014 at 12:43 PM, Bruce Momjian <bruce@xxxxxxxxxx
> <mailto:bruce@xxxxxxxxxx>> wrote:
> 
>     On Mon, Dec  8, 2014 at 03:40:43PM -0600, Merlin Moncure wrote:
>     > >> Did not see consistent measurable gains > 256
>     > >> effective_io_concurrency.  Interesting that a setting of '2' (the
>     > >> lowest possible setting with the feature actually working) is
>     > >> pessimal.
>     > >
>     > > Very interesting.  When we added a per-tablespace random_page_cost,
>     > > there was a suggestion that we might want to add per-tablespace
>     > > effective_io_concurrency someday:
>     >
>     > What I'd really like to see is to have effective_io_concurrency work
>     > on other types of scans.  It's clearly a barn burner on fast storage
>     > and perhaps the default should be something other than '1'.  Spinning
>     > storage is clearly dead and SSDs seem to really benefit from the POSIX
>     > readahead API.
> 
> 
> I haven't played much with SSD, but effective_io_concurrency can be a
> big win even on spinning disk.
>  
> 
> 
>     Well, the real question is knowing which blocks to request before
>     actually needing them.  With a bitmap scan, that is easy --- I am
>     unclear how to do it for other scans.  We already have kernel read-ahead
>     for sequential scans, and any index scan that hits multiple rows will
>     probably already be using a bitmap heap scan.
> 
> 
> If the index scan is used to provide ordering as well as selectivity
> then it will resist being converted to a bitmap scan.  Also, it won't be
> converted to a bitmap scan solely to get credit for the use of
> effective_io_concurrency, as that setting doesn't enter into planning
> decisions.  
> 
> For a regular index scan, it should be easy to prefetch table blocks for
> all the tuples that will need to be retrieved based on the current index
> leaf page, for example.  Looking ahead across leaf page boundaries would
> be harder.
> 
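
To make that idea a bit more concrete, here is a rough, hypothetical
sketch (it is not part of the patch attached below; the helper name, the
way the TIDs are collected and the prefetch budget are all made up for
illustration, while PrefetchBuffer(), MAIN_FORKNUM and
ItemPointerGetBlockNumber() are existing primitives): after reading an
index leaf page, issue prefetch requests for the heap blocks its TIDs
point to before actually fetching the tuples.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/itemptr.h"
#include "utils/rel.h"

/*
 * Prefetch the heap blocks referenced by the TIDs found on the index
 * leaf page currently being scanned, before the tuples are fetched.
 */
static void
prefetch_heap_blocks_for_leaf_page(Relation heapRel,
								   ItemPointerData *tids, int ntids,
								   int prefetch_budget)
{
	BlockNumber last_block = InvalidBlockNumber;
	int			issued = 0;
	int			i;

	for (i = 0; i < ntids && issued < prefetch_budget; i++)
	{
		BlockNumber blkno = ItemPointerGetBlockNumber(&tids[i]);

		/* consecutive TIDs often point at the same heap page, skip those */
		if (blkno == last_block)
			continue;

		PrefetchBuffer(heapRel, MAIN_FORKNUM, blkno);
		last_block = blkno;
		issued++;
	}
}

As noted above, looking ahead across leaf page boundaries would need
more bookkeeping than this.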

I also think that having effective_io_concurrency apply to nodes other
than bitmap heap scans would be really great, but for now a per-tablespace
effective_io_concurrency is simpler to implement and will already help,
so here's a patch to implement it.  I'm also adding it to the next
commitfest.
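
To get a feel for what a given setting translates to, here is a small
standalone program (just an illustration, independent of the patch) that
mirrors the harmonic-series loop the patch factors out into
compute_io_concurrency(): a setting of n spindles becomes a prefetch
target of roughly n * H(n) pages.

#include <stdio.h>

int
main(void)
{
	static const int settings[] = {1, 2, 4, 8, 16, 64, 128, 256};
	int			s;

	for (s = 0; s < (int) (sizeof(settings) / sizeof(settings[0])); s++)
	{
		int			n = settings[s];
		double		prefetch_pages = 0.0;
		int			i;

		/* same series as compute_io_concurrency(): n/1 + n/2 + ... + n/n */
		for (i = 1; i <= n; i++)
			prefetch_pages += (double) n / (double) i;

		printf("effective_io_concurrency = %3d -> ~%.0f prefetch pages\n",
			   n, prefetch_pages);
	}

	return 0;
}

A setting of 4 comes out at 8 1/3 pages, matching the table in the
comment block the patch moves around, and 256 lands at roughly 1568
pages.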

-- 
Julien Rouhaud
http://dalibo.com - http://dalibo.org
diff --git a/doc/src/sgml/ref/create_tablespace.sgml b/doc/src/sgml/ref/create_tablespace.sgml
index 5756c3e..cf08408 100644
--- a/doc/src/sgml/ref/create_tablespace.sgml
+++ b/doc/src/sgml/ref/create_tablespace.sgml
@@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable>
       <listitem>
        <para>
         A tablespace parameter to be set or reset.  Currently, the only
-        available parameters are <varname>seq_page_cost</> and
-        <varname>random_page_cost</>.  Setting either value for a particular
-        tablespace will override the planner's usual estimate of the cost of
-        reading pages from tables in that tablespace, as established by
-        the configuration parameters of the same name (see
-        <xref linkend="guc-seq-page-cost">,
-        <xref linkend="guc-random-page-cost">).  This may be useful if one
-        tablespace is located on a disk which is faster or slower than the
+        available parameters are <varname>seq_page_cost</>,
+        <varname>random_page_cost</> and <varname>effective_io_concurrency</>.
+        Setting any of these values for a particular tablespace will override
+        the planner's usual estimate of the cost of reading pages from tables
+        in that tablespace, and the executor's prefetching behavior, as
+        established by the configuration parameters of the same name (see
+        <xref linkend="guc-seq-page-cost">, <xref linkend="guc-random-page-cost">,
+        <xref linkend="guc-effective-io-concurrency">).  This may be useful if
+        one tablespace is located on a disk which is faster or slower than the
         remainder of the I/O subsystem.
        </para>
       </listitem>
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 8176b6a..fb24d74 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -232,6 +232,18 @@ static relopt_int intRelOpts[] =
 		},
 		-1, 64, MAX_KILOBYTES
 	},
+	{
+		{
+			"effective_io_concurrency",
+			"Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
+			RELOPT_KIND_TABLESPACE
+		},
+#ifdef USE_PREFETCH
+		-1, 0, MAX_IO_CONCURRENCY	/* default of -1 means "use the GUC" */
+#else
+		0, 0, 0
+#endif
+	},
 
 	/* list terminator */
 	{{NULL}}
@@ -1387,7 +1399,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
 	int			numoptions;
 	static const relopt_parse_elt tab[] = {
 		{"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
-		{"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}
+		{"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
+		{"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}
 	};
 
 	options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 4597437..7ea77c8 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -42,8 +42,11 @@
+#include <math.h>				/* for rint() */
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/predicate.h"
+#include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
+#include "utils/spccache.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"
 
@@ -111,7 +113,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		node->tbmres = tbmres = NULL;
 
 #ifdef USE_PREFETCH
-		if (target_prefetch_pages > 0)
+		if (node->target_prefetch_pages > 0)
 		{
 			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
 			node->prefetch_pages = 0;
@@ -188,10 +190,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			 * page/tuple, then to one after the second tuple is fetched, then
 			 * it doubles as later pages are fetched.
 			 */
-			if (node->prefetch_target >= target_prefetch_pages)
+			if (node->prefetch_target >= node->target_prefetch_pages)
 				 /* don't increase any further */ ;
-			else if (node->prefetch_target >= target_prefetch_pages / 2)
-				node->prefetch_target = target_prefetch_pages;
+			else if (node->prefetch_target >= node->target_prefetch_pages / 2)
+				node->prefetch_target = node->target_prefetch_pages;
 			else if (node->prefetch_target > 0)
 				node->prefetch_target *= 2;
 			else
@@ -211,7 +213,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			 * Try to prefetch at least a few pages even before we get to the
 			 * second page if we don't stop reading after the first tuple.
 			 */
-			if (node->prefetch_target < target_prefetch_pages)
+			if (node->prefetch_target < node->target_prefetch_pages)
 				node->prefetch_target++;
 #endif   /* USE_PREFETCH */
 		}
@@ -539,6 +541,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 {
 	BitmapHeapScanState *scanstate;
 	Relation	currentRelation;
+#ifdef USE_PREFETCH
+	int			new_io_concurrency;
+#endif
 
 	/* check for unsupported flags */
 	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -598,6 +603,25 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	 */
 	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
 
+#ifdef USE_PREFETCH
+	/* Check whether effective_io_concurrency has been overridden for the
+	 * tablespace storing the relation; if so, compute this scan's
+	 * target_prefetch_pages from it, otherwise keep the global value.
+	 */
+	new_io_concurrency = get_tablespace_io_concurrency(
+			currentRelation->rd_rel->reltablespace);
+
+
+	scanstate->target_prefetch_pages = target_prefetch_pages;
+
+	if (new_io_concurrency != effective_io_concurrency)
+	{
+		double		prefetch_pages;
+		if (compute_io_concurrency(new_io_concurrency, &prefetch_pages))
+			scanstate->target_prefetch_pages = rint(prefetch_pages);
+	}
+#endif
+
 	scanstate->ss.ss_currentRelation = currentRelation;
 
 	/*
@@ -634,3 +658,63 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	 */
 	return scanstate;
 }
+
+/*
+ * compute_io_concurrency
+ *		Translate a number-of-spindles setting into a number-of-pages-to-
+ *		prefetch target, using the formula described in the comment below.
+ */
+bool
+compute_io_concurrency(int io_concurrency, double *target_prefetch_pages)
+{
+	double		new_prefetch_pages = 0.0;
+	int			i;
+
+	/* Make sure the io_concurrency value is sane; it may have been set
+	 * directly with an UPDATE on pg_tablespace, bypassing validation.
+	 */
+	if (io_concurrency > MAX_IO_CONCURRENCY)
+		io_concurrency = MAX_IO_CONCURRENCY;
+
+	/*----------
+	 * The user-visible GUC parameter is the number of drives (spindles),
+	 * which we need to translate to a number-of-pages-to-prefetch target.
+	 * For the GUC, check_effective_io_concurrency() stashes the result in
+	 * *extra, and assign_effective_io_concurrency() sets the actual variable.
+	 *
+	 * The expected number of prefetch pages needed to keep N drives busy is:
+	 *
+	 * drives |   I/O requests
+	 * -------+----------------
+	 *		1 |   1
+	 *		2 |   2/1 + 2/2 = 3
+	 *		3 |   3/1 + 3/2 + 3/3 = 5 1/2
+	 *		4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
+	 *		n |   n * H(n)
+	 *
+	 * This is called the "coupon collector problem" and H(n) is called the
+	 * harmonic series.  This could be approximated by n * ln(n), but for
+	 * reasonable numbers of drives we might as well just compute the series.
+	 *
+	 * Alternatively we could set the target to the number of pages necessary
+	 * so that the expected number of active spindles is some arbitrary
+	 * percentage of the total.  This sounds the same but is actually slightly
+	 * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
+	 * that desired fraction.
+	 *
+	 * Experimental results show that both of these formulas aren't aggressive
+	 * enough, but we don't really have any better proposals.
+	 *
+	 * Note that if io_concurrency = 0 (disabled), we must set target = 0.
+	 *----------
+	 */
+
+
+	for (i = 1; i <= io_concurrency; i++)
+		new_prefetch_pages += (double) io_concurrency / (double) i;
+
+	*target_prefetch_pages = new_prefetch_pages;
+
+	/* This range check shouldn't fail, but let's be paranoid */
+	return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
+}
diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c
index 1a0c884..970d66b 100644
--- a/src/backend/utils/cache/spccache.c
+++ b/src/backend/utils/cache/spccache.c
@@ -23,7 +23,9 @@
 #include "commands/tablespace.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "storage/bufmgr.h"
 #include "utils/catcache.h"
+#include "utils/guc.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 #include "utils/spccache.h"
@@ -198,3 +200,16 @@ get_tablespace_page_costs(Oid spcid,
 			*spc_seq_page_cost = spc->opts->seq_page_cost;
 	}
 }
+
+int
+get_tablespace_io_concurrency(Oid spcid)
+{
+	TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+	Assert(spc != NULL);
+
+	if (!spc->opts || spc->opts->effective_io_concurrency < 0)
+		return effective_io_concurrency;
+	else
+		return spc->opts->effective_io_concurrency;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1bed525..6d7c0ae 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -37,6 +37,7 @@
 #include "commands/vacuum.h"
 #include "commands/variable.h"
 #include "commands/trigger.h"
+#include "executor/nodeBitmapHeapscan.h"
 #include "funcapi.h"
 #include "libpq/auth.h"
 #include "libpq/be-fsstubs.h"
@@ -438,6 +439,8 @@ int			temp_file_limit = -1;
 
 int			num_temp_buffers = 1024;
 
+int			effective_io_concurrency = 0;
+
 char	   *cluster_name = "";
 char	   *ConfigFileName;
 char	   *HbaFileName;
@@ -490,7 +493,6 @@ static int	wal_block_size;
 static bool data_checksums;
 static int	wal_segment_size;
 static bool integer_datetimes;
-static int	effective_io_concurrency;
 static bool assert_enabled;
 
 /* should be static, but commands/variable.c needs to get at this */
@@ -2352,7 +2354,7 @@ static struct config_int ConfigureNamesInt[] =
 		},
 		&effective_io_concurrency,
 #ifdef USE_PREFETCH
-		1, 0, 1000,
+		1, 0, MAX_IO_CONCURRENCY,
 #else
 		0, 0, 0,
 #endif
@@ -9997,47 +9999,9 @@ static bool
 check_effective_io_concurrency(int *newval, void **extra, GucSource source)
 {
 #ifdef USE_PREFETCH
-	double		new_prefetch_pages = 0.0;
-	int			i;
-
-	/*----------
-	 * The user-visible GUC parameter is the number of drives (spindles),
-	 * which we need to translate to a number-of-pages-to-prefetch target.
-	 * The target value is stashed in *extra and then assigned to the actual
-	 * variable by assign_effective_io_concurrency.
-	 *
-	 * The expected number of prefetch pages needed to keep N drives busy is:
-	 *
-	 * drives |   I/O requests
-	 * -------+----------------
-	 *		1 |   1
-	 *		2 |   2/1 + 2/2 = 3
-	 *		3 |   3/1 + 3/2 + 3/3 = 5 1/2
-	 *		4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
-	 *		n |   n * H(n)
-	 *
-	 * This is called the "coupon collector problem" and H(n) is called the
-	 * harmonic series.  This could be approximated by n * ln(n), but for
-	 * reasonable numbers of drives we might as well just compute the series.
-	 *
-	 * Alternatively we could set the target to the number of pages necessary
-	 * so that the expected number of active spindles is some arbitrary
-	 * percentage of the total.  This sounds the same but is actually slightly
-	 * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
-	 * that desired fraction.
-	 *
-	 * Experimental results show that both of these formulas aren't aggressive
-	 * enough, but we don't really have any better proposals.
-	 *
-	 * Note that if *newval = 0 (disabled), we must set target = 0.
-	 *----------
-	 */
-
-	for (i = 1; i <= *newval; i++)
-		new_prefetch_pages += (double) *newval / (double) i;
+	double		new_prefetch_pages;
 
-	/* This range check shouldn't fail, but let's be paranoid */
-	if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
+	if (compute_io_concurrency(*newval, &new_prefetch_pages))
 	{
 		int		   *myextra = (int *) guc_malloc(ERROR, sizeof(int));
 
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 0683548..36b8a75 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -1870,7 +1870,7 @@ psql_completion(const char *text, int start, int end)
 			 pg_strcasecmp(prev_wd, "(") == 0)
 	{
 		static const char *const list_TABLESPACEOPTIONS[] =
-		{"seq_page_cost", "random_page_cost", NULL};
+		{"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL};
 
 		COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS);
 	}
diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h
index 6b928a5..be9582a 100644
--- a/src/include/commands/tablespace.h
+++ b/src/include/commands/tablespace.h
@@ -39,6 +39,7 @@ typedef struct TableSpaceOpts
 	int32		vl_len_;		/* varlena header (do not touch directly!) */
 	float8		random_page_cost;
 	float8		seq_page_cost;
+	int			effective_io_concurrency;
 } TableSpaceOpts;
 
 extern Oid	CreateTableSpace(CreateTableSpaceStmt *stmt);
diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h
index 3183376..698fcf5 100644
--- a/src/include/executor/nodeBitmapHeapscan.h
+++ b/src/include/executor/nodeBitmapHeapscan.h
@@ -20,5 +20,6 @@ extern BitmapHeapScanState *ExecInitBitmapHeapScan(BitmapHeapScan *node, EState
 extern TupleTableSlot *ExecBitmapHeapScan(BitmapHeapScanState *node);
 extern void ExecEndBitmapHeapScan(BitmapHeapScanState *node);
 extern void ExecReScanBitmapHeapScan(BitmapHeapScanState *node);
+extern bool compute_io_concurrency(int io_concurrency, double *target_prefetch_pages);
 
 #endif   /* NODEBITMAPHEAPSCAN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 541ee18..c6d48fa 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1410,15 +1410,16 @@ typedef struct BitmapIndexScanState
 /* ----------------
  *	 BitmapHeapScanState information
  *
- *		bitmapqualorig	   execution state for bitmapqualorig expressions
- *		tbm				   bitmap obtained from child index scan(s)
- *		tbmiterator		   iterator for scanning current pages
- *		tbmres			   current-page data
- *		exact_pages		   total number of exact pages retrieved
- *		lossy_pages		   total number of lossy pages retrieved
- *		prefetch_iterator  iterator for prefetching ahead of current page
- *		prefetch_pages	   # pages prefetch iterator is ahead of current
- *		prefetch_target    target prefetch distance
+ *		bitmapqualorig			execution state for bitmapqualorig expressions
+ *		tbm						bitmap obtained from child index scan(s)
+ *		tbmiterator				iterator for scanning current pages
+ *		tbmres					current-page data
+ *		exact_pages				total number of exact pages retrieved
+ *		lossy_pages				total number of lossy pages retrieved
+ *		prefetch_iterator		iterator for prefetching ahead of current page
+ *		prefetch_pages			# pages prefetch iterator is ahead of current
+ *		prefetch_target			target prefetch distance
+ *		target_prefetch_pages	may be overridden by the tablespace setting
  * ----------------
  */
 typedef struct BitmapHeapScanState
@@ -1433,6 +1434,9 @@ typedef struct BitmapHeapScanState
 	TBMIterator *prefetch_iterator;
 	int			prefetch_pages;
 	int			prefetch_target;
+#ifdef USE_PREFETCH
+	int			target_prefetch_pages;
+#endif
 } BitmapHeapScanState;
 
 /* ----------------
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index dc167f9..57008fc 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -26,6 +26,9 @@
 #define MAX_KILOBYTES	(INT_MAX / 1024)
 #endif
 
+/* upper limit for effective_io_concurrency */
+#define MAX_IO_CONCURRENCY 1000
+
 /*
  * Automatic configuration file name for ALTER SYSTEM.
  * This file will be used to store values of configuration parameters
@@ -256,6 +259,8 @@ extern int	temp_file_limit;
 
 extern int	num_temp_buffers;
 
+extern int	effective_io_concurrency;
+
 extern char *cluster_name;
 extern char *ConfigFileName;
 extern char *HbaFileName;
diff --git a/src/include/utils/spccache.h b/src/include/utils/spccache.h
index bdd1c0f..e5b9769 100644
--- a/src/include/utils/spccache.h
+++ b/src/include/utils/spccache.h
@@ -15,5 +15,6 @@
 
 void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
 						  float8 *spc_seq_page_cost);
+int get_tablespace_io_concurrency(Oid spcid);
 
 #endif   /* SPCCACHE_H */
