[postgis-tickets] r17600 - Fast Hilbert btree.

Darafei komzpa at gmail.com
Sun Jul 14 07:24:19 PDT 2019


Author: komzpa
Date: 2019-07-14 07:24:18 -0700 (Sun, 14 Jul 2019)
New Revision: 17600

Modified:
   trunk/NEWS
   trunk/liblwgeom/gbox.c
   trunk/liblwgeom/gserialized.c
   trunk/liblwgeom/liblwgeom.h.in
   trunk/liblwgeom/liblwgeom_internal.h
   trunk/liblwgeom/lwinline.h
   trunk/liblwgeom/lwtree.c
   trunk/libpgcommon/gserialized_gist.h
   trunk/postgis/lwgeom_btree.c
   trunk/postgis/postgis.sql.in
Log:
Fast Hilbert btree.

Requires btree indexes to be recreated.

Closes https://github.com/postgis/postgis/pull/436
Closes #3883


Modified: trunk/NEWS
===================================================================
--- trunk/NEWS	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/NEWS	2019-07-14 14:24:18 UTC (rev 17600)
@@ -110,6 +110,9 @@
   - #4442, raster2pgsql now skips NODATA tiles. Use -k option if you still want
            them in database for some reason. (Darafei Praliaskouski)
   - #4433, 32-bit hash fix (requires reindexing hash(geometry) indexes) (Raúl Marín)
+  - #3383, Sorting now uses Hilbert curve and Postgres Abbreviated Compare.
+           You need to REINDEX your btree indexes if you had them.
+           (Darafei Praliaskouski)
 
 * New Features *
   - #2902, postgis_geos_noop (Sandro Santilli)

Modified: trunk/liblwgeom/gbox.c
===================================================================
--- trunk/liblwgeom/gbox.c	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/gbox.c	2019-07-14 14:24:18 UTC (rev 17600)
@@ -724,32 +724,105 @@
 	}
 }
 
-uint64_t uint32_interleave_2(uint32_t u1, uint32_t u2)
+inline static uint64_t
+uint64_interleave_2(uint64_t x, uint64_t y)
 {
-    uint64_t x = u1;
-    uint64_t y = u2;
-    int i;
+	x = (x | (x << 16)) & 0x0000FFFF0000FFFF;
+	x = (x | (x << 8)) & 0x00FF00FF00FF00FF;
+	x = (x | (x << 4)) & 0x0F0F0F0F0F0F0F0F;
+	x = (x | (x << 2)) & 0x3333333333333333;
+	x = (x | (x << 1)) & 0x5555555555555555;
 
-    static uint64_t B[5] =
-    {
-        0x5555555555555555,
-        0x3333333333333333,
-        0x0F0F0F0F0F0F0F0F,
-        0x00FF00FF00FF00FF,
-        0x0000FFFF0000FFFF
-    };
-    static uint64_t S[5] = { 1, 2, 4, 8, 16 };
+	y = (y | (y << 16)) & 0x0000FFFF0000FFFF;
+	y = (y | (y << 8)) & 0x00FF00FF00FF00FF;
+	y = (y | (y << 4)) & 0x0F0F0F0F0F0F0F0F;
+	y = (y | (y << 2)) & 0x3333333333333333;
+	y = (y | (y << 1)) & 0x5555555555555555;
 
-    for ( i = 4; i >= 0; i-- )
-    {
-        x = (x | (x << S[i])) & B[i];
-        y = (y | (y << S[i])) & B[i];
-    }
+	return x | (y << 1);
+}
 
-    return x | (y << 1);
+/* Based on https://github.com/rawrunprotected/hilbert_curves Public Domain code */
+inline static uint64_t
+uint32_hilbert(uint32_t px, uint32_t py)
+{
+	uint64_t x = px;
+	uint64_t y = py;
+
+	uint64_t A, B, C, D;
+
+	// Initial prefix scan round, prime with x and y
+	{
+		uint64_t a = x ^ y;
+		uint64_t b = 0xFFFFFFFF ^ a;
+		uint64_t c = 0xFFFFFFFF ^ (x | y);
+		uint64_t d = x & (y ^ 0xFFFFFFFF);
+
+		A = a | (b >> 1);
+		B = (a >> 1) ^ a;
+		C = ((c >> 1) ^ (b & (d >> 1))) ^ c;
+		D = ((a & (c >> 1)) ^ (d >> 1)) ^ d;
+	}
+
+	{
+		uint64_t a = A;
+		uint64_t b = B;
+		uint64_t c = C;
+		uint64_t d = D;
+
+		A = ((a & (a >> 2)) ^ (b & (b >> 2)));
+		B = ((a & (b >> 2)) ^ (b & ((a ^ b) >> 2)));
+		C ^= ((a & (c >> 2)) ^ (b & (d >> 2)));
+		D ^= ((b & (c >> 2)) ^ ((a ^ b) & (d >> 2)));
+	}
+
+	{
+		uint64_t a = A;
+		uint64_t b = B;
+		uint64_t c = C;
+		uint64_t d = D;
+
+		A = ((a & (a >> 4)) ^ (b & (b >> 4)));
+		B = ((a & (b >> 4)) ^ (b & ((a ^ b) >> 4)));
+		C ^= ((a & (c >> 4)) ^ (b & (d >> 4)));
+		D ^= ((b & (c >> 4)) ^ ((a ^ b) & (d >> 4)));
+	}
+
+	{
+		uint64_t a = A;
+		uint64_t b = B;
+		uint64_t c = C;
+		uint64_t d = D;
+
+		A = ((a & (a >> 8)) ^ (b & (b >> 8)));
+		B = ((a & (b >> 8)) ^ (b & ((a ^ b) >> 8)));
+		C ^= ((a & (c >> 8)) ^ (b & (d >> 8)));
+		D ^= ((b & (c >> 8)) ^ ((a ^ b) & (d >> 8)));
+	}
+
+	{
+		uint64_t a = A;
+		uint64_t b = B;
+		uint64_t c = C;
+		uint64_t d = D;
+
+		C ^= ((a & (c >> 16)) ^ (b & (d >> 16)));
+		D ^= ((b & (c >> 16)) ^ ((a ^ b) & (d >> 16)));
+	}
+
+	// Undo transformation prefix scan
+	uint64_t a = C ^ (C >> 1);
+	uint64_t b = D ^ (D >> 1);
+
+	// Recover index bits
+	uint64_t i0 = x ^ y;
+	uint64_t i1 = b | (0xFFFFFFFF ^ (i0 | a));
+
+	return uint64_interleave_2(i0, i1);
 }
 
-uint64_t gbox_get_sortable_hash(const GBOX *g)
+uint64_t
+gbox_get_sortable_hash(const GBOX *g, const int32_t srid)
 {
 	union floatuint {
 		uint32_t u;
@@ -773,20 +846,32 @@
 		p.z = (g->zmax + g->zmin) / 2.0;
 		normalize(&p);
 		cart2geog(&p, &gpt);
-		x.f = gpt.lon;
-		y.f = gpt.lat;
+		/* We know range for geography, so build the curve taking it into account */
+		x.f = 1.5 + gpt.lon / 512.0;
+		y.f = 1.5 + gpt.lat / 256.0;
 	}
 	else
 	{
+		x.f = (g->xmax + g->xmin) / 2;
+		y.f = (g->ymax + g->ymin) / 2;
 		/*
-		* Here we'd like to get two ordinates from 4 in the box.
-		* Since it's just a sortable bit representation we can omit division from (A+B)/2.
-		* All it should do is subtract 1 from exponent anyways.
-		*/
-		x.f = g->xmax + g->xmin;
-		y.f = g->ymax + g->ymin;
+		 * Tweak for popular SRID values: push floating point values into 1..2 range,
+		 * a region where exponent is constant and thus Hilbert curve
+		 * doesn't have compression artifact when X or Y value is close to 0.
+		 * If someone has out of bounds value it will still expose the arifact but not crash.
+		 * TODO: reconsider when we will have machinery to properly get bounds by SRID.
+		 */
+		if (srid == 3857 || srid == 3395)
+		{
+			x.f = 1.5 + x.f / 67108864.0;
+			y.f = 1.5 + y.f / 67108864.0;
+		}
+		else if (srid == 4326)
+		{
+			x.f = 1.5 + x.f / 512.0;
+			y.f = 1.5 + y.f / 256.0;
+		}
 	}
-	return uint32_interleave_2(x.u, y.u);
+
+	return uint32_hilbert(y.u, x.u);
 }
-
-

Modified: trunk/liblwgeom/gserialized.c
===================================================================
--- trunk/liblwgeom/gserialized.c	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/gserialized.c	2019-07-14 14:24:18 UTC (rev 17600)
@@ -309,127 +309,92 @@
 	) ? 0 : 1;
 }
 
+/* ORDER BY hash(g), g::bytea, ST_SRID(g), hasz(g), hasm(g) */
 int gserialized_cmp(const GSERIALIZED *g1, const GSERIALIZED *g2)
 {
-	union floatuint {
-		uint32_t u;
-		float f;
-	};
-
-	int g1_is_empty, g2_is_empty, cmp;
 	GBOX box1, box2;
 	uint64_t hash1, hash2;
 	size_t sz1 = SIZE_GET(g1->size);
 	size_t sz2 = SIZE_GET(g2->size);
-	union floatuint x, y;
-
 	size_t hsz1 = gserialized_header_size(g1);
 	size_t hsz2 = gserialized_header_size(g2);
-
-	/*
-	* For two non-same points, we can skip a lot of machinery.
-	*/
-	if (
-		sz1 > 16 && // 16 is size of EMPTY, if it's larger - it has coordinates
-		sz2 > 16 &&
-		hsz1 == 8 &&
-		hsz2 == 8 &&
-		*(uint32_t*)(g1->data) == POINTTYPE &&
-		*(uint32_t*)(g2->data) == POINTTYPE
-	)
-	{
-		double *dptr = (double*)(g1->data + 8);
-		x.f = 2.0 * dptr[0];
-		y.f = 2.0 * dptr[1];
-		hash1 = uint32_interleave_2(x.u, y.u);
-
-		dptr = (double*)(g2->data + 8);
-		x.f = 2.0 * dptr[0];
-		y.f = 2.0 * dptr[1];
-		hash2 = uint32_interleave_2(x.u, y.u);
-
-		/* If the SRIDs are the same, we can use hash inequality */
-		/* to jump us out of this function early. Otherwise we still */
-		/* have to do the full calculation */
-		if (gserialized_cmp_srid(g1, g2) == 0)
-		{
-			if (hash1 > hash2)
-				return 1;
-			if (hash1 < hash2)
-				return -1;
-		}
-
-		/* if hashes happen to be the same, go to full compare. */
-	}
-
 	uint8_t *b1 = (uint8_t*)g1 + hsz1;
 	uint8_t *b2 = (uint8_t*)g2 + hsz2;
 	size_t bsz1 = sz1 - hsz1;
 	size_t bsz2 = sz2 - hsz2;
-	size_t bsz = bsz1 < bsz2 ? bsz1 : bsz2;
+	size_t bsz_min = bsz1 < bsz2 ? bsz1 : bsz2;
 
+	/* Equality fast path */
+	/* Return equality for perfect equality only */
 	int cmp_srid = gserialized_cmp_srid(g1, g2);
+	int cmp = memcmp(b1, b2, bsz_min);
+	int g1hasz = gserialized_has_z(g1);
+	int g1hasm = gserialized_has_m(g1);
+	int g2hasz = gserialized_has_z(g2);
+	int g2hasm = gserialized_has_m(g2);
 
-	g1_is_empty = (gserialized_get_gbox_p(g1, &box1) == LW_FAILURE);
-	g2_is_empty = (gserialized_get_gbox_p(g2, &box2) == LW_FAILURE);
-
-	/* Empty < Non-empty */
-	if (g1_is_empty && !g2_is_empty)
-		return -1;
-
-	/* Non-empty > Empty */
-	if (!g1_is_empty && g2_is_empty)
-		return 1;
-
-	/* Return equality for perfect equality only */
-	cmp = memcmp(b1, b2, bsz);
-	if (bsz1 == bsz2 && cmp_srid == 0 && cmp == 0)
+	if (bsz1 == bsz2 && cmp_srid == 0 && cmp == 0 && g1hasz == g2hasz && g1hasm == g2hasm)
 		return 0;
-
-	if (!g1_is_empty && !g2_is_empty)
+	else
 	{
-		/* Using the centroids, calculate somewhat sortable */
-		/* hash key. The key doesn't provide good locality over */
-		/* the +/- boundary, but otherwise is pretty OK */
-		hash1 = gbox_get_sortable_hash(&box1);
-		hash2 = gbox_get_sortable_hash(&box2);
+		int g1_is_empty = (gserialized_get_gbox_p(g1, &box1) == LW_FAILURE);
+		int g2_is_empty = (gserialized_get_gbox_p(g2, &box2) == LW_FAILURE);
+		int32_t srid1 = gserialized_get_srid(g1);
+		int32_t srid2 = gserialized_get_srid(g2);
 
-		if (hash1 > hash2)
-			return 1;
-		else if (hash1 < hash2)
+		/* Empty < Non-empty */
+		if (g1_is_empty && !g2_is_empty)
 			return -1;
 
-		/* What, the hashes are equal? OK... sort on the */
-		/* box minima */
-		if (box1.xmin < box2.xmin)
-			return -1;
-		else if (box1.xmin > box2.xmin)
+		/* Non-empty > Empty */
+		if (!g1_is_empty && g2_is_empty)
 			return 1;
 
-		if (box1.ymin < box2.ymin)
-			return -1;
-		else if (box1.ymin > box2.ymin)
-			return 1;
+		if (!g1_is_empty && !g2_is_empty)
+		{
+			/* Using the boxes, calculate sortable hash key. */
+			hash1 = gbox_get_sortable_hash(&box1, srid1);
+			hash2 = gbox_get_sortable_hash(&box2, srid2);
 
-		/* Still equal? OK... sort on the box maxima */
-		if (box1.xmax < box2.xmax)
-			return -1;
-		else if (box1.xmax > box2.xmax)
-			return 1;
+			if (hash1 > hash2)
+				return 1;
+			if (hash1 < hash2)
+				return -1;
+		}
 
-		if (box1.ymax < box2.ymax)
-			return -1;
-		else if (box1.ymax > box2.ymax)
-			return 1;
-	}
+		/* Prefix comes before longer one. */
+		if (bsz1 != bsz2 && cmp == 0)
+		{
+			if (bsz1 < bsz2)
+				return -1;
+			else if (bsz1 > bsz2)
+				return 1;
+		}
 
-	/* Prefix comes before longer one. */
-	if (bsz1 != bsz2 && cmp == 0)
-	{
-		if (bsz1 < bsz2)
- 			return -1;
-		else if (bsz1 > bsz2)
- 			return 1;
+		/* If SRID is not equal, sort on it */
+		if (cmp_srid != 0)
+			return (srid1 > srid2) ? 1 : -1;
+
+		/* ZM flag sort*/
+		if (g1hasz != g2hasz)
+			return (g1hasz > g2hasz) ? 1 : -1;
+
+		if (g1hasm != g2hasm)
+			return (g1hasm > g2hasm) ? 1 : -1;
+
+		assert(cmp != 0);
+		return cmp > 0 ? 1 : -1;
 	}
-	return cmp > 0 ? 1 : -1;
 }
+
+uint64_t
+gserialized_get_sortable_hash(const GSERIALIZED *g)
+{
+	GBOX box;
+	int is_empty = (gserialized_get_gbox_p(g, &box) == LW_FAILURE);
+
+	if (is_empty)
+		return 0;
+	else
+		return gbox_get_sortable_hash(&box, gserialized_get_srid(g));
+}

Modified: trunk/liblwgeom/liblwgeom.h.in
===================================================================
--- trunk/liblwgeom/liblwgeom.h.in	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/liblwgeom.h.in	2019-07-14 14:24:18 UTC (rev 17600)
@@ -2043,12 +2043,16 @@
 extern int gbox_is_valid(const GBOX *gbox);
 
 /**
-* Return a sortable key based on the center point of the
-* GBOX.
+* Return a sortable key based on the center point of the GBOX.
 */
-extern uint64_t gbox_get_sortable_hash(const GBOX *g);
+extern uint64_t gbox_get_sortable_hash(const GBOX *g, const int32_t srid);
 
 /**
+* Return a sortable key based on gserialized.
+*/
+extern uint64_t gserialized_get_sortable_hash(const GSERIALIZED *g);
+
+/**
 * Utility function to get type number from string. For example, a string 'POINTZ'
 * would return type of 1 and z of 1 and m of 0. Valid
 */

Modified: trunk/liblwgeom/liblwgeom_internal.h
===================================================================
--- trunk/liblwgeom/liblwgeom_internal.h	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/liblwgeom_internal.h	2019-07-14 14:24:18 UTC (rev 17600)
@@ -167,7 +167,6 @@
 /* Machine endianness */
 #define XDR 0 /* big endian */
 #define NDR 1 /* little endian */
-uint64_t uint32_interleave_2(uint32_t u1, uint32_t u2);
 
 
 /*

Modified: trunk/liblwgeom/lwinline.h
===================================================================
--- trunk/liblwgeom/lwinline.h	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/lwinline.h	2019-07-14 14:24:18 UTC (rev 17600)
@@ -205,3 +205,15 @@
 		break;
 	}
 }
+
+/*
+ * This macro is based on PG_FREE_IF_COPY, except that it accepts two pointers.
+ * See PG_FREE_IF_COPY comment in src/include/fmgr.h in postgres source code
+ * for more details.
+ */
+#define POSTGIS_FREE_IF_COPY_P(ptrsrc, ptrori) \
+	do \
+	{ \
+		if ((Pointer)(ptrsrc) != (Pointer)(ptrori)) \
+			pfree(ptrsrc); \
+	} while (0)

Modified: trunk/liblwgeom/lwtree.c
===================================================================
--- trunk/liblwgeom/lwtree.c	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/lwtree.c	2019-07-14 14:24:18 UTC (rev 17600)
@@ -56,8 +56,8 @@
 	b2.ymin = n2->ymin;
 	b2.ymax = n2->ymax;
 
-	h1 = gbox_get_sortable_hash(&b1);
-	h2 = gbox_get_sortable_hash(&b2);
+	h1 = gbox_get_sortable_hash(&b1, 0);
+	h2 = gbox_get_sortable_hash(&b2, 0);
 	return h1 < h2 ? -1 : (h1 > h2 ? 1 : 0);
 }
 

Modified: trunk/libpgcommon/gserialized_gist.h
===================================================================
--- trunk/libpgcommon/gserialized_gist.h	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/libpgcommon/gserialized_gist.h	2019-07-14 14:24:18 UTC (rev 17600)
@@ -26,18 +26,6 @@
 #define GIDX_MAX_SIZE 36
 #define GIDX_MAX_DIM 4
 
-/*
- * This macro is based on PG_FREE_IF_COPY, except that it accepts two pointers.
- * See PG_FREE_IF_COPY comment in src/include/fmgr.h in postgres source code
- * for more details.
- */
-#define POSTGIS_FREE_IF_COPY_P(ptrsrc, ptrori) \
-	do \
-	{ \
-		if ((Pointer)(ptrsrc) != (Pointer)(ptrori)) \
-			pfree(ptrsrc); \
-	} while (0)
-
 /**********************************************************************
 **  BOX2DF structure.
 **

Modified: trunk/postgis/lwgeom_btree.c
===================================================================
--- trunk/postgis/lwgeom_btree.c	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/postgis/lwgeom_btree.c	2019-07-14 14:24:18 UTC (rev 17600)
@@ -29,6 +29,7 @@
 #include "fmgr.h"
 #include "access/hash.h"
 #include "utils/geo_decls.h"
+#include "utils/sortsupport.h" /* SortSupport */
 
 #include "../postgis_config.h"
 #include "liblwgeom.h"
@@ -131,10 +132,68 @@
 Datum lwgeom_hash(PG_FUNCTION_ARGS)
 {
 	GSERIALIZED *g1 = PG_GETARG_GSERIALIZED_P(0);
+
 	int32_t hval = gserialized_hash(g1);
 	PG_FREE_IF_COPY(g1, 0);
 	PG_RETURN_INT32(hval);
 }
 
+static int
+lwgeom_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+	/* Empty is a special case */
+	if (x == 0 || y == 0 || x == y)
+		return 0; /* 0 means "ask bigger comparator" and not equality*/
+	else if (x > y)
+		return 1;
+	else
+		return -1;
+}
 
+static int
+lwgeom_cmp_full(Datum x, Datum y, SortSupport ssup)
+{
+	GSERIALIZED *g1 = (GSERIALIZED *)PG_DETOAST_DATUM(x);
+	GSERIALIZED *g2 = (GSERIALIZED *)PG_DETOAST_DATUM(y);
+	int ret = gserialized_cmp(g1, g2);
+	POSTGIS_FREE_IF_COPY_P(g1, x);
+	POSTGIS_FREE_IF_COPY_P(g2, y);
+	return ret;
+}
 
+static bool
+lwgeom_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+	return LW_FALSE;
+}
+
+static Datum
+lwgeom_abbrev_convert(Datum original, SortSupport ssup)
+{
+	GSERIALIZED *g = (GSERIALIZED *)PG_DETOAST_DATUM(original);
+	uint64_t hash = gserialized_get_sortable_hash(g);
+	POSTGIS_FREE_IF_COPY_P(g, original);
+	return hash;
+}
+
+/*
+ * Sort support strategy routine
+ */
+PG_FUNCTION_INFO_V1(lwgeom_sortsupport);
+Datum lwgeom_sortsupport(PG_FUNCTION_ARGS)
+{
+	SortSupport ssup = (SortSupport)PG_GETARG_POINTER(0);
+
+	ssup->comparator = lwgeom_cmp_full;
+	ssup->ssup_extra = NULL;
+	/* Enable sortsupport only on 64 bit Datum */
+	if (ssup->abbreviate && sizeof(Datum) == 8)
+	{
+		ssup->comparator = lwgeom_cmp_abbrev;
+		ssup->abbrev_converter = lwgeom_abbrev_convert;
+		ssup->abbrev_abort = lwgeom_abbrev_abort;
+		ssup->abbrev_full_comparator = lwgeom_cmp_full;
+	}
+
+	PG_RETURN_VOID();
+}

Modified: trunk/postgis/postgis.sql.in
===================================================================
--- trunk/postgis/postgis.sql.in	2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/postgis/postgis.sql.in	2019-07-14 14:24:18 UTC (rev 17600)
@@ -377,6 +377,12 @@
 	AS 'MODULE_PATHNAME', 'lwgeom_cmp'
 	LANGUAGE 'c' IMMUTABLE STRICT _PARALLEL;
 
+-- Availability: 3.0.0
+CREATE OR REPLACE FUNCTION geometry_sortsupport(internal)
+	RETURNS void
+	AS 'MODULE_PATHNAME', 'lwgeom_sortsupport'
+	LANGUAGE 'c' IMMUTABLE STRICT _PARALLEL;
+
 --
 -- Sorting operators for Btree
 --
@@ -424,7 +430,9 @@
 	OPERATOR	3	= ,
 	OPERATOR	4	>= ,
 	OPERATOR	5	> ,
-	FUNCTION	1	geometry_cmp (geom1 geometry, geom2 geometry);
+	FUNCTION	1	geometry_cmp (geom1 geometry, geom2 geometry),
+	-- Availability: 3.0.0
+	FUNCTION	2	geometry_sortsupport(internal);
 
 --
 -- Sorting operators for Btree



More information about the postgis-tickets mailing list