[postgis-tickets] r17600 - Fast Hilbert btree.
Darafei
komzpa at gmail.com
Sun Jul 14 07:24:19 PDT 2019
Author: komzpa
Date: 2019-07-14 07:24:18 -0700 (Sun, 14 Jul 2019)
New Revision: 17600
Modified:
trunk/NEWS
trunk/liblwgeom/gbox.c
trunk/liblwgeom/gserialized.c
trunk/liblwgeom/liblwgeom.h.in
trunk/liblwgeom/liblwgeom_internal.h
trunk/liblwgeom/lwinline.h
trunk/liblwgeom/lwtree.c
trunk/libpgcommon/gserialized_gist.h
trunk/postgis/lwgeom_btree.c
trunk/postgis/postgis.sql.in
Log:
Fast Hilbert btree.
Requires btree indexes to be recreated.
Closes https://github.com/postgis/postgis/pull/436
Closes #3883
Modified: trunk/NEWS
===================================================================
--- trunk/NEWS 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/NEWS 2019-07-14 14:24:18 UTC (rev 17600)
@@ -110,6 +110,9 @@
- #4442, raster2pgsql now skips NODATA tiles. Use -k option if you still want
them in database for some reason. (Darafei Praliaskouski)
- #4433, 32-bit hash fix (requires reindexing hash(geometry) indexes) (Raúl Marín)
+ - #3383, Sorting now uses Hilbert curve and Postgres Abbreviated Compare.
+ You need to REINDEX your btree indexes if you had them.
+ (Darafei Praliaskouski)
* New Features *
- #2902, postgis_geos_noop (Sandro Santilli)
Modified: trunk/liblwgeom/gbox.c
===================================================================
--- trunk/liblwgeom/gbox.c 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/gbox.c 2019-07-14 14:24:18 UTC (rev 17600)
@@ -724,32 +724,105 @@
}
}
-uint64_t uint32_interleave_2(uint32_t u1, uint32_t u2)
+inline static uint64_t
+uint64_interleave_2(uint64_t x, uint64_t y)
{
- uint64_t x = u1;
- uint64_t y = u2;
- int i;
+ x = (x | (x << 16)) & 0x0000FFFF0000FFFF;
+ x = (x | (x << 8)) & 0x00FF00FF00FF00FF;
+ x = (x | (x << 4)) & 0x0F0F0F0F0F0F0F0F;
+ x = (x | (x << 2)) & 0x3333333333333333;
+ x = (x | (x << 1)) & 0x5555555555555555;
- static uint64_t B[5] =
- {
- 0x5555555555555555,
- 0x3333333333333333,
- 0x0F0F0F0F0F0F0F0F,
- 0x00FF00FF00FF00FF,
- 0x0000FFFF0000FFFF
- };
- static uint64_t S[5] = { 1, 2, 4, 8, 16 };
+ y = (y | (y << 16)) & 0x0000FFFF0000FFFF;
+ y = (y | (y << 8)) & 0x00FF00FF00FF00FF;
+ y = (y | (y << 4)) & 0x0F0F0F0F0F0F0F0F;
+ y = (y | (y << 2)) & 0x3333333333333333;
+ y = (y | (y << 1)) & 0x5555555555555555;
- for ( i = 4; i >= 0; i-- )
- {
- x = (x | (x << S[i])) & B[i];
- y = (y | (y << S[i])) & B[i];
- }
+ return x | (y << 1);
+}
- return x | (y << 1);
+/* Based on https://github.com/rawrunprotected/hilbert_curves Public Domain code */
+inline static uint64_t
+uint32_hilbert(uint32_t px, uint32_t py)
+{
+ uint64_t x = px;
+ uint64_t y = py;
+
+ uint64_t A, B, C, D;
+
+ // Initial prefix scan round, prime with x and y
+ {
+ uint64_t a = x ^ y;
+ uint64_t b = 0xFFFFFFFF ^ a;
+ uint64_t c = 0xFFFFFFFF ^ (x | y);
+ uint64_t d = x & (y ^ 0xFFFFFFFF);
+
+ A = a | (b >> 1);
+ B = (a >> 1) ^ a;
+ C = ((c >> 1) ^ (b & (d >> 1))) ^ c;
+ D = ((a & (c >> 1)) ^ (d >> 1)) ^ d;
+ }
+
+ {
+ uint64_t a = A;
+ uint64_t b = B;
+ uint64_t c = C;
+ uint64_t d = D;
+
+ A = ((a & (a >> 2)) ^ (b & (b >> 2)));
+ B = ((a & (b >> 2)) ^ (b & ((a ^ b) >> 2)));
+ C ^= ((a & (c >> 2)) ^ (b & (d >> 2)));
+ D ^= ((b & (c >> 2)) ^ ((a ^ b) & (d >> 2)));
+ }
+
+ {
+ uint64_t a = A;
+ uint64_t b = B;
+ uint64_t c = C;
+ uint64_t d = D;
+
+ A = ((a & (a >> 4)) ^ (b & (b >> 4)));
+ B = ((a & (b >> 4)) ^ (b & ((a ^ b) >> 4)));
+ C ^= ((a & (c >> 4)) ^ (b & (d >> 4)));
+ D ^= ((b & (c >> 4)) ^ ((a ^ b) & (d >> 4)));
+ }
+
+ {
+ uint64_t a = A;
+ uint64_t b = B;
+ uint64_t c = C;
+ uint64_t d = D;
+
+ A = ((a & (a >> 8)) ^ (b & (b >> 8)));
+ B = ((a & (b >> 8)) ^ (b & ((a ^ b) >> 8)));
+ C ^= ((a & (c >> 8)) ^ (b & (d >> 8)));
+ D ^= ((b & (c >> 8)) ^ ((a ^ b) & (d >> 8)));
+ }
+
+ {
+ uint64_t a = A;
+ uint64_t b = B;
+ uint64_t c = C;
+ uint64_t d = D;
+
+ C ^= ((a & (c >> 16)) ^ (b & (d >> 16)));
+ D ^= ((b & (c >> 16)) ^ ((a ^ b) & (d >> 16)));
+ }
+
+ // Undo transformation prefix scan
+ uint64_t a = C ^ (C >> 1);
+ uint64_t b = D ^ (D >> 1);
+
+ // Recover index bits
+ uint64_t i0 = x ^ y;
+ uint64_t i1 = b | (0xFFFFFFFF ^ (i0 | a));
+
+ return uint64_interleave_2(i0, i1);
}
-uint64_t gbox_get_sortable_hash(const GBOX *g)
+uint64_t
+gbox_get_sortable_hash(const GBOX *g, const int32_t srid)
{
union floatuint {
uint32_t u;
@@ -773,20 +846,32 @@
p.z = (g->zmax + g->zmin) / 2.0;
normalize(&p);
cart2geog(&p, &gpt);
- x.f = gpt.lon;
- y.f = gpt.lat;
+ /* We know range for geography, so build the curve taking it into account */
+ x.f = 1.5 + gpt.lon / 512.0;
+ y.f = 1.5 + gpt.lat / 256.0;
}
else
{
+ x.f = (g->xmax + g->xmin) / 2;
+ y.f = (g->ymax + g->ymin) / 2;
/*
- * Here we'd like to get two ordinates from 4 in the box.
- * Since it's just a sortable bit representation we can omit division from (A+B)/2.
- * All it should do is subtract 1 from exponent anyways.
- */
- x.f = g->xmax + g->xmin;
- y.f = g->ymax + g->ymin;
+ * Tweak for popular SRID values: push floating point values into 1..2 range,
+ * a region where exponent is constant and thus Hilbert curve
+ * doesn't have compression artifact when X or Y value is close to 0.
+ * If someone has out of bounds value it will still expose the arifact but not crash.
+ * TODO: reconsider when we will have machinery to properly get bounds by SRID.
+ */
+ if (srid == 3857 || srid == 3395)
+ {
+ x.f = 1.5 + x.f / 67108864.0;
+ y.f = 1.5 + y.f / 67108864.0;
+ }
+ else if (srid == 4326)
+ {
+ x.f = 1.5 + x.f / 512.0;
+ y.f = 1.5 + y.f / 256.0;
+ }
}
- return uint32_interleave_2(x.u, y.u);
+
+ return uint32_hilbert(y.u, x.u);
}
-
-
Modified: trunk/liblwgeom/gserialized.c
===================================================================
--- trunk/liblwgeom/gserialized.c 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/gserialized.c 2019-07-14 14:24:18 UTC (rev 17600)
@@ -309,127 +309,92 @@
) ? 0 : 1;
}
+/* ORDER BY hash(g), g::bytea, ST_SRID(g), hasz(g), hasm(g) */
int gserialized_cmp(const GSERIALIZED *g1, const GSERIALIZED *g2)
{
- union floatuint {
- uint32_t u;
- float f;
- };
-
- int g1_is_empty, g2_is_empty, cmp;
GBOX box1, box2;
uint64_t hash1, hash2;
size_t sz1 = SIZE_GET(g1->size);
size_t sz2 = SIZE_GET(g2->size);
- union floatuint x, y;
-
size_t hsz1 = gserialized_header_size(g1);
size_t hsz2 = gserialized_header_size(g2);
-
- /*
- * For two non-same points, we can skip a lot of machinery.
- */
- if (
- sz1 > 16 && // 16 is size of EMPTY, if it's larger - it has coordinates
- sz2 > 16 &&
- hsz1 == 8 &&
- hsz2 == 8 &&
- *(uint32_t*)(g1->data) == POINTTYPE &&
- *(uint32_t*)(g2->data) == POINTTYPE
- )
- {
- double *dptr = (double*)(g1->data + 8);
- x.f = 2.0 * dptr[0];
- y.f = 2.0 * dptr[1];
- hash1 = uint32_interleave_2(x.u, y.u);
-
- dptr = (double*)(g2->data + 8);
- x.f = 2.0 * dptr[0];
- y.f = 2.0 * dptr[1];
- hash2 = uint32_interleave_2(x.u, y.u);
-
- /* If the SRIDs are the same, we can use hash inequality */
- /* to jump us out of this function early. Otherwise we still */
- /* have to do the full calculation */
- if (gserialized_cmp_srid(g1, g2) == 0)
- {
- if (hash1 > hash2)
- return 1;
- if (hash1 < hash2)
- return -1;
- }
-
- /* if hashes happen to be the same, go to full compare. */
- }
-
uint8_t *b1 = (uint8_t*)g1 + hsz1;
uint8_t *b2 = (uint8_t*)g2 + hsz2;
size_t bsz1 = sz1 - hsz1;
size_t bsz2 = sz2 - hsz2;
- size_t bsz = bsz1 < bsz2 ? bsz1 : bsz2;
+ size_t bsz_min = bsz1 < bsz2 ? bsz1 : bsz2;
+ /* Equality fast path */
+ /* Return equality for perfect equality only */
int cmp_srid = gserialized_cmp_srid(g1, g2);
+ int cmp = memcmp(b1, b2, bsz_min);
+ int g1hasz = gserialized_has_z(g1);
+ int g1hasm = gserialized_has_m(g1);
+ int g2hasz = gserialized_has_z(g2);
+ int g2hasm = gserialized_has_m(g2);
- g1_is_empty = (gserialized_get_gbox_p(g1, &box1) == LW_FAILURE);
- g2_is_empty = (gserialized_get_gbox_p(g2, &box2) == LW_FAILURE);
-
- /* Empty < Non-empty */
- if (g1_is_empty && !g2_is_empty)
- return -1;
-
- /* Non-empty > Empty */
- if (!g1_is_empty && g2_is_empty)
- return 1;
-
- /* Return equality for perfect equality only */
- cmp = memcmp(b1, b2, bsz);
- if (bsz1 == bsz2 && cmp_srid == 0 && cmp == 0)
+ if (bsz1 == bsz2 && cmp_srid == 0 && cmp == 0 && g1hasz == g2hasz && g1hasm == g2hasm)
return 0;
-
- if (!g1_is_empty && !g2_is_empty)
+ else
{
- /* Using the centroids, calculate somewhat sortable */
- /* hash key. The key doesn't provide good locality over */
- /* the +/- boundary, but otherwise is pretty OK */
- hash1 = gbox_get_sortable_hash(&box1);
- hash2 = gbox_get_sortable_hash(&box2);
+ int g1_is_empty = (gserialized_get_gbox_p(g1, &box1) == LW_FAILURE);
+ int g2_is_empty = (gserialized_get_gbox_p(g2, &box2) == LW_FAILURE);
+ int32_t srid1 = gserialized_get_srid(g1);
+ int32_t srid2 = gserialized_get_srid(g2);
- if (hash1 > hash2)
- return 1;
- else if (hash1 < hash2)
+ /* Empty < Non-empty */
+ if (g1_is_empty && !g2_is_empty)
return -1;
- /* What, the hashes are equal? OK... sort on the */
- /* box minima */
- if (box1.xmin < box2.xmin)
- return -1;
- else if (box1.xmin > box2.xmin)
+ /* Non-empty > Empty */
+ if (!g1_is_empty && g2_is_empty)
return 1;
- if (box1.ymin < box2.ymin)
- return -1;
- else if (box1.ymin > box2.ymin)
- return 1;
+ if (!g1_is_empty && !g2_is_empty)
+ {
+ /* Using the boxes, calculate sortable hash key. */
+ hash1 = gbox_get_sortable_hash(&box1, srid1);
+ hash2 = gbox_get_sortable_hash(&box2, srid2);
- /* Still equal? OK... sort on the box maxima */
- if (box1.xmax < box2.xmax)
- return -1;
- else if (box1.xmax > box2.xmax)
- return 1;
+ if (hash1 > hash2)
+ return 1;
+ if (hash1 < hash2)
+ return -1;
+ }
- if (box1.ymax < box2.ymax)
- return -1;
- else if (box1.ymax > box2.ymax)
- return 1;
- }
+ /* Prefix comes before longer one. */
+ if (bsz1 != bsz2 && cmp == 0)
+ {
+ if (bsz1 < bsz2)
+ return -1;
+ else if (bsz1 > bsz2)
+ return 1;
+ }
- /* Prefix comes before longer one. */
- if (bsz1 != bsz2 && cmp == 0)
- {
- if (bsz1 < bsz2)
- return -1;
- else if (bsz1 > bsz2)
- return 1;
+ /* If SRID is not equal, sort on it */
+ if (cmp_srid != 0)
+ return (srid1 > srid2) ? 1 : -1;
+
+ /* ZM flag sort*/
+ if (g1hasz != g2hasz)
+ return (g1hasz > g2hasz) ? 1 : -1;
+
+ if (g1hasm != g2hasm)
+ return (g1hasm > g2hasm) ? 1 : -1;
+
+ assert(cmp != 0);
+ return cmp > 0 ? 1 : -1;
}
- return cmp > 0 ? 1 : -1;
}
+
+uint64_t
+gserialized_get_sortable_hash(const GSERIALIZED *g)
+{
+ GBOX box;
+ int is_empty = (gserialized_get_gbox_p(g, &box) == LW_FAILURE);
+
+ if (is_empty)
+ return 0;
+ else
+ return gbox_get_sortable_hash(&box, gserialized_get_srid(g));
+}
Modified: trunk/liblwgeom/liblwgeom.h.in
===================================================================
--- trunk/liblwgeom/liblwgeom.h.in 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/liblwgeom.h.in 2019-07-14 14:24:18 UTC (rev 17600)
@@ -2043,12 +2043,16 @@
extern int gbox_is_valid(const GBOX *gbox);
/**
-* Return a sortable key based on the center point of the
-* GBOX.
+* Return a sortable key based on the center point of the GBOX.
*/
-extern uint64_t gbox_get_sortable_hash(const GBOX *g);
+extern uint64_t gbox_get_sortable_hash(const GBOX *g, const int32_t srid);
/**
+* Return a sortable key based on gserialized.
+*/
+extern uint64_t gserialized_get_sortable_hash(const GSERIALIZED *g);
+
+/**
* Utility function to get type number from string. For example, a string 'POINTZ'
* would return type of 1 and z of 1 and m of 0. Valid
*/
Modified: trunk/liblwgeom/liblwgeom_internal.h
===================================================================
--- trunk/liblwgeom/liblwgeom_internal.h 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/liblwgeom_internal.h 2019-07-14 14:24:18 UTC (rev 17600)
@@ -167,7 +167,6 @@
/* Machine endianness */
#define XDR 0 /* big endian */
#define NDR 1 /* little endian */
-uint64_t uint32_interleave_2(uint32_t u1, uint32_t u2);
/*
Modified: trunk/liblwgeom/lwinline.h
===================================================================
--- trunk/liblwgeom/lwinline.h 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/lwinline.h 2019-07-14 14:24:18 UTC (rev 17600)
@@ -205,3 +205,15 @@
break;
}
}
+
+/*
+ * This macro is based on PG_FREE_IF_COPY, except that it accepts two pointers.
+ * See PG_FREE_IF_COPY comment in src/include/fmgr.h in postgres source code
+ * for more details.
+ */
+#define POSTGIS_FREE_IF_COPY_P(ptrsrc, ptrori) \
+ do \
+ { \
+ if ((Pointer)(ptrsrc) != (Pointer)(ptrori)) \
+ pfree(ptrsrc); \
+ } while (0)
Modified: trunk/liblwgeom/lwtree.c
===================================================================
--- trunk/liblwgeom/lwtree.c 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/liblwgeom/lwtree.c 2019-07-14 14:24:18 UTC (rev 17600)
@@ -56,8 +56,8 @@
b2.ymin = n2->ymin;
b2.ymax = n2->ymax;
- h1 = gbox_get_sortable_hash(&b1);
- h2 = gbox_get_sortable_hash(&b2);
+ h1 = gbox_get_sortable_hash(&b1, 0);
+ h2 = gbox_get_sortable_hash(&b2, 0);
return h1 < h2 ? -1 : (h1 > h2 ? 1 : 0);
}
Modified: trunk/libpgcommon/gserialized_gist.h
===================================================================
--- trunk/libpgcommon/gserialized_gist.h 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/libpgcommon/gserialized_gist.h 2019-07-14 14:24:18 UTC (rev 17600)
@@ -26,18 +26,6 @@
#define GIDX_MAX_SIZE 36
#define GIDX_MAX_DIM 4
-/*
- * This macro is based on PG_FREE_IF_COPY, except that it accepts two pointers.
- * See PG_FREE_IF_COPY comment in src/include/fmgr.h in postgres source code
- * for more details.
- */
-#define POSTGIS_FREE_IF_COPY_P(ptrsrc, ptrori) \
- do \
- { \
- if ((Pointer)(ptrsrc) != (Pointer)(ptrori)) \
- pfree(ptrsrc); \
- } while (0)
-
/**********************************************************************
** BOX2DF structure.
**
Modified: trunk/postgis/lwgeom_btree.c
===================================================================
--- trunk/postgis/lwgeom_btree.c 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/postgis/lwgeom_btree.c 2019-07-14 14:24:18 UTC (rev 17600)
@@ -29,6 +29,7 @@
#include "fmgr.h"
#include "access/hash.h"
#include "utils/geo_decls.h"
+#include "utils/sortsupport.h" /* SortSupport */
#include "../postgis_config.h"
#include "liblwgeom.h"
@@ -131,10 +132,68 @@
Datum lwgeom_hash(PG_FUNCTION_ARGS)
{
GSERIALIZED *g1 = PG_GETARG_GSERIALIZED_P(0);
+
int32_t hval = gserialized_hash(g1);
PG_FREE_IF_COPY(g1, 0);
PG_RETURN_INT32(hval);
}
+static int
+lwgeom_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+ /* Empty is a special case */
+ if (x == 0 || y == 0 || x == y)
+ return 0; /* 0 means "ask bigger comparator" and not equality*/
+ else if (x > y)
+ return 1;
+ else
+ return -1;
+}
+static int
+lwgeom_cmp_full(Datum x, Datum y, SortSupport ssup)
+{
+ GSERIALIZED *g1 = (GSERIALIZED *)PG_DETOAST_DATUM(x);
+ GSERIALIZED *g2 = (GSERIALIZED *)PG_DETOAST_DATUM(y);
+ int ret = gserialized_cmp(g1, g2);
+ POSTGIS_FREE_IF_COPY_P(g1, x);
+ POSTGIS_FREE_IF_COPY_P(g2, y);
+ return ret;
+}
+static bool
+lwgeom_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+ return LW_FALSE;
+}
+
+static Datum
+lwgeom_abbrev_convert(Datum original, SortSupport ssup)
+{
+ GSERIALIZED *g = (GSERIALIZED *)PG_DETOAST_DATUM(original);
+ uint64_t hash = gserialized_get_sortable_hash(g);
+ POSTGIS_FREE_IF_COPY_P(g, original);
+ return hash;
+}
+
+/*
+ * Sort support strategy routine
+ */
+PG_FUNCTION_INFO_V1(lwgeom_sortsupport);
+Datum lwgeom_sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport)PG_GETARG_POINTER(0);
+
+ ssup->comparator = lwgeom_cmp_full;
+ ssup->ssup_extra = NULL;
+ /* Enable sortsupport only on 64 bit Datum */
+ if (ssup->abbreviate && sizeof(Datum) == 8)
+ {
+ ssup->comparator = lwgeom_cmp_abbrev;
+ ssup->abbrev_converter = lwgeom_abbrev_convert;
+ ssup->abbrev_abort = lwgeom_abbrev_abort;
+ ssup->abbrev_full_comparator = lwgeom_cmp_full;
+ }
+
+ PG_RETURN_VOID();
+}
Modified: trunk/postgis/postgis.sql.in
===================================================================
--- trunk/postgis/postgis.sql.in 2019-07-14 05:06:19 UTC (rev 17599)
+++ trunk/postgis/postgis.sql.in 2019-07-14 14:24:18 UTC (rev 17600)
@@ -377,6 +377,12 @@
AS 'MODULE_PATHNAME', 'lwgeom_cmp'
LANGUAGE 'c' IMMUTABLE STRICT _PARALLEL;
+-- Availability: 3.0.0
+CREATE OR REPLACE FUNCTION geometry_sortsupport(internal)
+ RETURNS void
+ AS 'MODULE_PATHNAME', 'lwgeom_sortsupport'
+ LANGUAGE 'c' IMMUTABLE STRICT _PARALLEL;
+
--
-- Sorting operators for Btree
--
@@ -424,7 +430,9 @@
OPERATOR 3 = ,
OPERATOR 4 >= ,
OPERATOR 5 > ,
- FUNCTION 1 geometry_cmp (geom1 geometry, geom2 geometry);
+ FUNCTION 1 geometry_cmp (geom1 geometry, geom2 geometry),
+ -- Availability: 3.0.0
+ FUNCTION 2 geometry_sortsupport(internal);
--
-- Sorting operators for Btree
More information about the postgis-tickets
mailing list