[SCM] PostGIS branch stable-3.5 updated. 3.5.4-5-g06fbe94ff

git at osgeo.org git at osgeo.org
Fri Nov 7 10:47:39 PST 2025


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "PostGIS".

The branch, stable-3.5, has been updated
       via  06fbe94ffe37c1e9d26328b7554c2050feaae1b6 (commit)
       via  6b8bae5486f12dc53634f1d7982186c6413d0ef5 (commit)
      from  949a623cd0afc9802ebb228d8dc7b477272b9b18 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
commit 06fbe94ffe37c1e9d26328b7554c2050feaae1b6
Author: Darafei Praliaskouski <me at komzpa.net>
Date:   Thu Oct 30 03:46:26 2025 +0400

    Guard against histogram axis dimension underflow
    
    References #5959
    References #5984

diff --git a/NEWS b/NEWS
index df71a295a..075318f84 100644
--- a/NEWS
+++ b/NEWS
@@ -6,11 +6,7 @@ PostgreSQL 12-18 required. GEOS 3.8+ required. Proj 6.1+ required.
 
 * Bug fixes *
 
-
-
-* Bug Fixes *
-
- - #5959, Prevent histogram target overflow when analysing massive tables (Darafei Praliaskouski)
+  - #5959, #5984, Prevent histogram target overflow when analysing massive tables (Darafei Praliaskouski)
 
 
 PostGIS 3.5.4
@@ -3083,4 +3079,3 @@ PostGIS 0.1
 - truely_inside()
 - rtree index support functions
 - gist index support functions
-
diff --git a/postgis/cunit/cu_tester.c b/postgis/cunit/cu_tester.c
index b4dd46aa7..eb6ba5b0d 100644
--- a/postgis/cunit/cu_tester.c
+++ b/postgis/cunit/cu_tester.c
@@ -28,6 +28,7 @@
 
 #include <CUnit/Basic.h>
 #include <limits.h>
+#include <math.h>
 #include <string.h>
 
 #include "../gserialized_estimate_support.h"
@@ -82,6 +83,23 @@ histogram_budget_clamps(void)
 	CU_ASSERT_EQUAL(histogram_cell_budget((double)INT_MAX, 50000, INT_MAX), INT_MAX);
 }
 
+static void
+histogram_axis_allocation_guards(void)
+{
+	/* Baseline: evenly split a 10k target over two varying dimensions. */
+	CU_ASSERT_EQUAL(histogram_axis_cells(10000, 2, 0.5), 100);
+
+	/* Skewed axis ratios that collapse to tiny powers still return one cell. */
+	CU_ASSERT_EQUAL(histogram_axis_cells(10000, 2, 1e-9), 1);
+
+	/* Denormals, NaNs and negative ratios should not leak to the histogram. */
+	CU_ASSERT_EQUAL(histogram_axis_cells(10000, 2, NAN), 1);
+	CU_ASSERT_EQUAL(histogram_axis_cells(10000, 2, -0.5), 1);
+
+	/* Extremely aggressive ratios remain bounded by the square root of the budget. */
+	CU_ASSERT_EQUAL(histogram_axis_cells(INT_MAX, 2, 1.0), (int)sqrt((double)INT_MAX * 2.0));
+}
+
 static void
 nd_stats_indexing_behaviour(void)
 {
@@ -138,6 +156,7 @@ main(void)
 		goto cleanup;
 
 	if (!CU_add_test(suite, "histogram budget clamps", histogram_budget_clamps) ||
+	    !CU_add_test(suite, "histogram axis guards", histogram_axis_allocation_guards) ||
 	    !CU_add_test(suite, "nd_stats value index guards", nd_stats_indexing_behaviour) ||
 	    !CU_add_test(suite, "nd_box ratio edge cases", nd_box_ratio_cases))
 	{
diff --git a/postgis/gserialized_estimate.c b/postgis/gserialized_estimate.c
index 072d60a1c..e561fd5dd 100644
--- a/postgis/gserialized_estimate.c
+++ b/postgis/gserialized_estimate.c
@@ -1516,11 +1516,10 @@ compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfu
 				 * Scale the target cells number by the # of dims and ratio,
 				 * then take the appropriate root to get the estimated number of cells
 				 * on this axis (eg, pow(0.5) for 2d, pow(0.333) for 3d, pow(0.25) for 4d)
-				*/
-				histo_size[d] = (int)pow((double)histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
-				/* If something goes awry, just give this dim one slot */
-				if ( ! histo_size[d] )
-					histo_size[d] = 1;
+				 * The dedicated helper clamps pathological floating point inputs so we
+				 * do not resurrect the NaN propagation reported in #5959 on amd64.
+				 */
+				histo_size[d] = histogram_axis_cells(histo_cells_target, histo_ndims, edge_ratio);
 			}
 			histo_cells_new *= histo_size[d];
 		}
diff --git a/postgis/gserialized_estimate_support.h b/postgis/gserialized_estimate_support.h
index 0d3a23d75..6b372a43e 100644
--- a/postgis/gserialized_estimate_support.h
+++ b/postgis/gserialized_estimate_support.h
@@ -151,6 +151,46 @@ histogram_cell_budget(double total_rows, int ndims, int attstattarget)
 	return (int)budget;
 }
 
+/*
+ * Allocate histogram buckets along a single axis in proportion to the observed
+ * density variation.  The caller passes in the global histogram target along
+ * with the number of axes that exhibited variation in the sampled data and the
+ * relative contribution of the current axis (edge_ratio).  Earlier versions
+ * evaluated the pow() call directly in the caller, which exposed the planner to
+ * NaN propagation on some amd64 builds when the ratio was denormal or negative
+ * (see #5959).  Keeping the calculation in one place allows us to clamp the
+ * inputs and provide a predictable fallback for problematic floating point
+ * combinations.
+ */
+static inline int
+histogram_axis_cells(int histo_cells_target, int histo_ndims, double edge_ratio)
+{
+	double scaled;
+	double axis_cells;
+
+	if (histo_cells_target <= 0 || histo_ndims <= 0)
+		return 1;
+
+	if (!(edge_ratio > 0.0) || !isfinite(edge_ratio))
+		return 1;
+
+	scaled = (double)histo_cells_target * (double)histo_ndims * edge_ratio;
+	if (!(scaled > 0.0) || !isfinite(scaled))
+		return 1;
+
+	axis_cells = pow(scaled, 1.0 / (double)histo_ndims);
+	if (!(axis_cells > 0.0) || !isfinite(axis_cells))
+		return 1;
+
+	if (axis_cells >= (double)INT_MAX)
+		return INT_MAX;
+
+	if (axis_cells <= 1.0)
+		return 1;
+
+	return (int)axis_cells;
+}
+
 /*
  * Compute the portion of 'target' covered by 'cover'.  The caller supplies the
  * dimensionality because ND_BOX always carries four slots.  Degenerate volumes

commit 6b8bae5486f12dc53634f1d7982186c6413d0ef5
Author: Darafei Praliaskouski <me at komzpa.net>
Date:   Thu Oct 30 02:55:59 2025 +0400

    Prevent histogram target overflow when analysing massive tables
    
    Add CUnit tests for overflow scenarios
    
    Closes #5959

diff --git a/NEWS b/NEWS
index 72aaf5e0d..df71a295a 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,10 @@ PostgreSQL 12-18 required. GEOS 3.8+ required. Proj 6.1+ required.
 
 
 
+* Bug Fixes *
+
+ - #5959, Prevent histogram target overflow when analysing massive tables (Darafei Praliaskouski)
+
 
 PostGIS 3.5.4
 2025/10/16
diff --git a/configure.ac b/configure.ac
index a60828c9d..0411756d7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1905,6 +1905,7 @@ AC_CONFIG_FILES([GNUmakefile
    libpgcommon/Makefile
    libpgcommon/cunit/Makefile
    postgis/Makefile
+   postgis/cunit/Makefile
    postgis/sqldefines.h
    sfcgal/Makefile
    $SFCGAL_MAKEFILE_LIST
diff --git a/postgis/cunit/Makefile.in b/postgis/cunit/Makefile.in
new file mode 100644
index 000000000..483e4ca10
--- /dev/null
+++ b/postgis/cunit/Makefile.in
@@ -0,0 +1,43 @@
+# **********************************************************************
+# *
+# * PostGIS - Spatial Types for PostgreSQL
+# * http://postgis.net
+# *
+# * Copyright 2025 Darafei Praliaskouski <me at komzpa.net>
+# *
+# * This is free software; you can redistribute and/or modify it under
+# * the terms of the GNU General Public Licence. See the COPYING file.
+# *
+# **********************************************************************
+
+srcdir = @srcdir@
+top_builddir = @top_builddir@
+
+CC=@CC@
+LIBTOOL=@LIBTOOL@
+CFLAGS = @CFLAGS@ @CPPFLAGS@ @PGSQL_BE_CPPFLAGS@ @CUNIT_CPPFLAGS@ -I.. -I$(top_builddir) -I@top_srcdir@/liblwgeom -I@top_builddir@/liblwgeom -I@top_srcdir@/libpgcommon -I@top_builddir@/libpgcommon
+LDFLAGS = @CUNIT_LDFLAGS@ -lm
+
+VPATH = $(srcdir)
+
+OBJS = cu_tester.o
+
+# Build the standalone histogram helper tester.
+all: cu_tester
+
+# Execute the suite directly; no installation step is required.
+check: all
+	$(LIBTOOL) --mode=execute ./cu_tester
+
+# Link the tester with libtool; all helper code is header-only.
+cu_tester: $(OBJS)
+	$(LIBTOOL) --mode=link $(CC) $(CFLAGS) -o $@ $(OBJS) $(LDFLAGS)
+
+%.o: %.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+clean:
+	rm -f $(OBJS) cu_tester
+
+clobber distclean: clean
+	rm -f Makefile
diff --git a/postgis/cunit/cu_tester.c b/postgis/cunit/cu_tester.c
new file mode 100644
index 000000000..b4dd46aa7
--- /dev/null
+++ b/postgis/cunit/cu_tester.c
@@ -0,0 +1,154 @@
+/**********************************************************************
+ *
+ * PostGIS - Spatial Types for PostgreSQL
+ * http://postgis.net
+ *
+ * This file is part of PostGIS
+ *
+ * PostGIS is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * PostGIS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PostGIS.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************
+ *
+ * Copyright 2025 (C) Darafei Praliaskouski <me at komzpa.net>
+ *
+ **********************************************************************/
+
+#include "postgres.h"
+
+#include <CUnit/Basic.h>
+#include <limits.h>
+#include <string.h>
+
+#include "../gserialized_estimate_support.h"
+
+static ND_BOX
+make_box(float minx, float miny, float minz, float minm, float maxx, float maxy, float maxz, float maxm)
+{
+	ND_BOX box;
+
+	memset(&box, 0, sizeof(box));
+	box.min[0] = minx;
+	box.min[1] = miny;
+	box.min[2] = minz;
+	box.min[3] = minm;
+	box.max[0] = maxx;
+	box.max[1] = maxy;
+	box.max[2] = maxz;
+	box.max[3] = maxm;
+	return box;
+}
+
+static void
+histogram_budget_clamps(void)
+{
+	/* Zero or negative row counts disable histogram construction. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(0.0, 2, 100), 0);
+	CU_ASSERT_EQUAL(histogram_cell_budget(-1.0, 4, 100), 0);
+
+	/* Degenerate dimensionality cannot allocate histogram space. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(1000.0, 0, 100), 0);
+
+	/* Matches the classic pow(attstattarget, ndims) path. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(1e6, 2, 100), 10000);
+	CU_ASSERT_EQUAL(histogram_cell_budget(1e6, 3, 50), 125000);
+
+	/* attstattarget^ndims exceeds ndims * 100000 and must be clamped. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(1e6, 4, 50), 400000);
+
+	/* attstattarget<=0 is normalised to the smallest viable target. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(1e6, 2, 0), 1);
+
+	/* Row clamp shrinks the grid for small relations. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(1.0, 2, 100), 20);
+
+	/* Large tables now preserve the dimensional cap instead of overflowing. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(1.5e8, 2, 100), 10000);
+
+	/* Regression for #5984: huge attstat targets stabilise instead of wrapping. */
+	CU_ASSERT_EQUAL(histogram_cell_budget(5e6, 2, 10000), 200000);
+
+	/* Trigger the INT_MAX guard once both other caps exceed it. */
+	CU_ASSERT_EQUAL(histogram_cell_budget((double)INT_MAX, 50000, INT_MAX), INT_MAX);
+}
+
+static void
+nd_stats_indexing_behaviour(void)
+{
+	ND_STATS stats;
+	const int good_index[ND_DIMS] = {1, 2, 0, 0};
+	const int bad_index[ND_DIMS] = {1, 5, 0, 0};
+
+	memset(&stats, 0, sizeof(stats));
+	stats.ndims = 3;
+	stats.size[0] = 4.0f;
+	stats.size[1] = 5.0f;
+	stats.size[2] = 3.0f;
+
+	/* Three-dimensional index  (x=1, y=2, z=0) collapses into 1 + 2 * 4. */
+	CU_ASSERT_EQUAL(nd_stats_value_index(&stats, good_index), 1 + 2 * 4);
+	/* Any request outside the histogram bounds triggers a guard. */
+	CU_ASSERT_EQUAL(nd_stats_value_index(&stats, bad_index), -1);
+
+	/* Regression for #5959: ndims higher than populated sizes still honours guards. */
+	stats.ndims = 4;
+	CU_ASSERT_EQUAL(nd_stats_value_index(&stats, good_index), -1);
+}
+
+static void
+nd_box_ratio_cases(void)
+{
+	ND_BOX covering = make_box(0.0f, 0.0f, 0.0f, 0.0f, 2.0f, 2.0f, 2.0f, 0.0f);
+	ND_BOX interior = make_box(0.5f, 0.5f, 0.5f, 0.0f, 1.5f, 1.5f, 1.5f, 0.0f);
+	ND_BOX partial = make_box(0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f, 0.0f);
+	ND_BOX target = make_box(0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f);
+	ND_BOX flat = make_box(0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f);
+	ND_BOX touch = make_box(2.0f, 0.0f, 0.0f, 0.0f, 3.0f, 1.0f, 1.0f, 0.0f);
+
+	/* Full coverage should evaluate to one regardless of the extra extent. */
+	CU_ASSERT_DOUBLE_EQUAL(nd_box_ratio(&covering, &interior, 3), 1.0, 1e-12);
+	/* A shared octant carries one eighth of the reference volume. */
+	CU_ASSERT_DOUBLE_EQUAL(nd_box_ratio(&partial, &target, 3), 0.125, 1e-12);
+	/* Degenerate slabs have zero volume in three dimensions. */
+	CU_ASSERT_DOUBLE_EQUAL(nd_box_ratio(&covering, &flat, 3), 0.0, 1e-12);
+	/* Boxes that only touch along a face should not count as overlap. */
+	CU_ASSERT_DOUBLE_EQUAL(nd_box_ratio(&covering, &touch, 3), 0.0, 1e-12);
+}
+
+int
+main(void)
+{
+	CU_pSuite suite;
+	unsigned int failures = 0;
+	if (CU_initialize_registry() != CUE_SUCCESS)
+		return CU_get_error();
+
+	suite = CU_add_suite("gserialized_histogram_helpers", NULL, NULL);
+	if (!suite)
+		goto cleanup;
+
+	if (!CU_add_test(suite, "histogram budget clamps", histogram_budget_clamps) ||
+	    !CU_add_test(suite, "nd_stats value index guards", nd_stats_indexing_behaviour) ||
+	    !CU_add_test(suite, "nd_box ratio edge cases", nd_box_ratio_cases))
+	{
+		goto cleanup;
+	}
+
+	CU_basic_set_mode(CU_BRM_VERBOSE);
+	CU_basic_run_tests();
+
+cleanup:
+	failures = CU_get_number_of_tests_failed();
+	CU_cleanup_registry();
+	return failures == 0 ? CUE_SUCCESS : 1;
+}
diff --git a/postgis/gserialized_estimate.c b/postgis/gserialized_estimate.c
index 24984e9d7..072d60a1c 100644
--- a/postgis/gserialized_estimate.c
+++ b/postgis/gserialized_estimate.c
@@ -19,11 +19,10 @@
  **********************************************************************
  *
  * Copyright 2012 (C) Paul Ramsey <pramsey at cleverelephant.ca>
+ * Copyright 2025 (C) Darafei Praliaskouski <me at komzpa.net>
  *
  **********************************************************************/
 
-
-
 /**********************************************************************
  THEORY OF OPERATION
 
@@ -112,10 +111,12 @@ dimensionality cases. (2D geometry) &&& (3D column), etc.
 #include "stringbuffer.h"
 #include "liblwgeom.h"
 #include "lwgeodetic.h"
-#include "lwgeom_pg.h"       /* For debugging macros. */
+#include "lwgeom_pg.h"        /* For debugging macros. */
 #include "gserialized_gist.h" /* For index common functions */
+#include "gserialized_estimate_support.h"
 
 #include <math.h>
+#include <limits.h>
 #if HAVE_IEEEFP_H
 #include <ieeefp.h>
 #endif
@@ -144,8 +145,7 @@ Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS);
 
 /* Local prototypes */
 static Oid table_get_spatial_index(Oid tbl_oid, int16 attnum, int *key_type, int16 *idx_attnum);
-static GBOX * spatial_index_read_extent(Oid idx_oid, int idx_att_num, int key_type);
-
+static GBOX *spatial_index_read_extent(Oid idx_oid, int idx_att_num, int key_type);
 
 /* Other prototypes */
 float8 gserialized_joinsel_internal(PlannerInfo *root, List *args, JoinType jointype, int mode);
@@ -186,13 +186,6 @@ Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
 */
 #define SDFACTOR 3.25
 
-/**
-* The maximum number of dimensions our code can handle.
-* We'll use this to statically allocate a bunch of
-* arrays below.
-*/
-#define ND_DIMS 4
-
 /**
 * Minimum width of a dimension that we'll bother trying to
 * compute statistics on. Bearing in mind we have no control
@@ -219,68 +212,6 @@ Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
 #define FALLBACK_ND_SEL 0.2
 #define FALLBACK_ND_JOINSEL 0.3
 
-/**
-* N-dimensional box type for calculations, to avoid doing
-* explicit axis conversions from GBOX in all calculations
-* at every step.
-*/
-typedef struct ND_BOX_T
-{
-	float4 min[ND_DIMS];
-	float4 max[ND_DIMS];
-} ND_BOX;
-
-/**
-* N-dimensional box index type
-*/
-typedef struct ND_IBOX_T
-{
-	int min[ND_DIMS];
-	int max[ND_DIMS];
-} ND_IBOX;
-
-
-/**
-* N-dimensional statistics structure. Well, actually
-* four-dimensional, but set up to handle arbitrary dimensions
-* if necessary (really, we just want to get the 2,3,4-d cases
-* into one shared piece of code).
-*/
-typedef struct ND_STATS_T
-{
-	/* Dimensionality of the histogram. */
-	float4 ndims;
-
-	/* Size of n-d histogram in each dimension. */
-	float4 size[ND_DIMS];
-
-	/* Lower-left (min) and upper-right (max) spatial bounds of histogram. */
-	ND_BOX extent;
-
-	/* How many rows in the table itself? */
-	float4 table_features;
-
-	/* How many rows were in the sample that built this histogram? */
-	float4 sample_features;
-
-	/* How many not-Null/Empty features were in the sample? */
-	float4 not_null_features;
-
-	/* How many features actually got sampled in the histogram? */
-	float4 histogram_features;
-
-	/* How many cells in histogram? (sizex*sizey*sizez*sizem) */
-	float4 histogram_cells;
-
-	/* How many cells did those histogram features cover? */
-	/* Since we are pro-rating coverage, this number should */
-	/* now always equal histogram_features */
-	float4 cells_covered;
-
-	/* Variable length # of floats for histogram */
-	float4 value[1];
-} ND_STATS;
-
 typedef struct {
 	/* Saved state from std_typanalyze() */
 	AnalyzeAttrComputeStatsFunc std_compute_stats;
@@ -318,13 +249,12 @@ text_p_get_mode(const text *txt)
 	char *modestr;
 	if (VARSIZE_ANY_EXHDR(txt) <= 0)
 		return mode;
-	modestr = (char*)VARDATA(txt);
-	if ( modestr[0] == 'N' )
+	modestr = (char *)VARDATA(txt);
+	if (modestr[0] == 'N')
 		mode = 0;
 	return mode;
 }
 
-
 /**
 * Integer comparison function for qsort
 */
@@ -372,7 +302,7 @@ total_double(const double *vals, int nvals)
 	int i;
 	float total = 0;
 	/* Calculate total */
-	for ( i = 0; i < nvals; i++ )
+	for (i = 0; i < nvals; i++)
 		total += vals[i];
 
 	return total;
@@ -425,33 +355,6 @@ stddev(const int *vals, int nvals)
 }
 #endif /* POSTGIS_DEBUG_LEVEL >= 3 */
 
-/**
-* Given a position in the n-d histogram (i,j,k) return the
-* position in the 1-d values array.
-*/
-static int
-nd_stats_value_index(const ND_STATS *stats, int *indexes)
-{
-	int d;
-	int accum = 1, vdx = 0;
-
-	/* Calculate the index into the 1-d values array that the (i,j,k,l) */
-	/* n-d histogram coordinate implies. */
-	/* index = x + y * sizex + z * sizex * sizey + m * sizex * sizey * sizez */
-	for ( d = 0; d < (int)(stats->ndims); d++ )
-	{
-		int size = (int)(stats->size[d]);
-		if ( indexes[d] < 0 || indexes[d] >= size )
-		{
-			POSTGIS_DEBUGF(3, " bad index at (%d, %d)", indexes[0], indexes[1]);
-			return -1;
-		}
-		vdx += indexes[d] * accum;
-		accum *= size;
-	}
-	return vdx;
-}
-
 /**
 * Convert an #ND_BOX to a JSON string for printing
 */
@@ -722,50 +625,6 @@ nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
 	return true;
 }
 
-/**
-* Returns the proportion of b2 that is covered by b1.
-*/
-static inline double
-nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
-{
-	int d;
-	bool covered = true;
-	double ivol = 1.0;
-	double vol2 = 1.0;
-
-	for ( d = 0 ; d < ndims; d++ )
-	{
-		if ( b1->max[d] <= b2->min[d] || b1->min[d] >= b2->max[d] )
-			return 0.0; /* Disjoint */
-
-		if ( b1->min[d] > b2->min[d] || b1->max[d] < b2->max[d] )
-			covered = false;
-	}
-
-	if ( covered )
-		return 1.0;
-
-	for ( d = 0; d < ndims; d++ )
-	{
-		double width2 = b2->max[d] - b2->min[d];
-		double imin, imax, iwidth;
-
-		vol2 *= width2;
-
-		imin = Max(b1->min[d], b2->min[d]);
-		imax = Min(b1->max[d], b2->max[d]);
-		iwidth = imax - imin;
-		iwidth = Max(0.0, iwidth);
-
-		ivol *= iwidth;
-	}
-
-	if ( vol2 == 0.0 )
-		return vol2;
-
-	return ivol / vol2;
-}
-
 /* How many bins shall we use in figuring out the distribution? */
 #define MAX_NUM_BINS 50
 #define BIN_MIN_SIZE 10
@@ -894,9 +753,9 @@ nd_increment(ND_IBOX *ibox, int ndims, int *counter)
 {
 	int d = 0;
 
-	while ( d < ndims )
+	while (d < ndims)
 	{
-		if ( counter[d] < ibox->max[d] )
+		if (counter[d] < ibox->max[d])
 		{
 			counter[d] += 1;
 			break;
@@ -905,7 +764,7 @@ nd_increment(ND_IBOX *ibox, int ndims, int *counter)
 		d++;
 	}
 	/* That's it, cannot increment any more! */
-	if ( d == ndims )
+	if (d == ndims)
 		return false;
 
 	/* Increment complete! */
@@ -1321,9 +1180,9 @@ gserialized_joinsel_internal(PlannerInfo *root, List *args, JoinType jointype, i
 PG_FUNCTION_INFO_V1(gserialized_gist_joinsel);
 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
 {
-	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+	PlannerInfo *root = (PlannerInfo *)PG_GETARG_POINTER(0);
 	/* Oid operator = PG_GETARG_OID(1); */
-	List *args = (List *) PG_GETARG_POINTER(2);
+	List *args = (List *)PG_GETARG_POINTER(2);
 	JoinType jointype = (JoinType) PG_GETARG_INT16(3);
 	int mode = PG_GETARG_INT32(4);
 
@@ -1512,22 +1371,13 @@ compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfu
 #endif
 	}
 
-	/*
-	 * We'll build a histogram having stats->attr->attstattarget
-	 * (default 100) cells on each side,  within reason...
-	 * we'll use ndims*100000 as the maximum number of cells.
-	 * Also, if we're sampling a relatively small table, we'll try to ensure that
-	 * we have a smaller grid.
-	 */
 #if POSTGIS_PGSQL_VERSION >= 170
-	histo_cells_target = (int)pow((double)(stats->attstattarget), (double)ndims);
 	POSTGIS_DEBUGF(3, " stats->attstattarget: %d", stats->attstattarget);
+	histo_cells_target = histogram_cell_budget(total_rows, ndims, stats->attstattarget);
 #else
-	histo_cells_target = (int)pow((double)(stats->attr->attstattarget), (double)ndims);
 	POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
+	histo_cells_target = histogram_cell_budget(total_rows, ndims, stats->attr->attstattarget);
 #endif
-	histo_cells_target = Min(histo_cells_target, ndims * 100000);
-	histo_cells_target = Min(histo_cells_target, (int)(10 * ndims * total_rows));
 	POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
 
 	/* If there's no useful features, we can't work out stats */
@@ -1836,8 +1686,6 @@ compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfu
 
 	return;
 }
-
-
 /**
 * In order to do useful selectivity calculations in both 2-D and N-D
 * modes, we actually have to generate two stats objects, one for 2-D
@@ -1875,7 +1723,6 @@ compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
 	}
 }
 
-
 /**
 * This function will be called when the ANALYZE command is run
 * on a column of the "geometry" or "geography" type.
diff --git a/postgis/gserialized_estimate_support.h b/postgis/gserialized_estimate_support.h
new file mode 100644
index 000000000..0d3a23d75
--- /dev/null
+++ b/postgis/gserialized_estimate_support.h
@@ -0,0 +1,197 @@
+/**********************************************************************
+ *
+ * PostGIS - Spatial Types for PostgreSQL
+ * http://postgis.net
+ *
+ * This file is part of PostGIS
+ *
+ * PostGIS is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * PostGIS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PostGIS.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************
+ *
+ * Internal helpers shared between the gserialized selectivity
+ * implementation and the unit tests.
+ *
+ * Keeping the routines header-only ensures the planner code and the
+ * harness evaluate the exact same floating-point flows without the
+ * cross-object plumbing that previously complicated maintenance.
+ * Nothing here is installed; the header is meant for
+ * gserialized_estimate.c and for the dedicated CUnit suite only.
+ *
+ **********************************************************************
+ *
+ * Copyright 2012 (C) Paul Ramsey <pramsey at cleverelephant.ca>
+ * Copyright 2025 (C) Darafei Praliaskouski <me at komzpa.net>
+ *
+ **********************************************************************/
+
+#ifndef POSTGIS_GSERIALIZED_ESTIMATE_SUPPORT_H
+#define POSTGIS_GSERIALIZED_ESTIMATE_SUPPORT_H
+
+#include "postgres.h"
+
+#include <limits.h>
+#include <math.h>
+
+/* The maximum number of dimensions our statistics code supports. */
+#define ND_DIMS 4
+
+/* Lightweight n-dimensional box representation for selectivity math. */
+typedef struct ND_BOX_T {
+	float4 min[ND_DIMS];
+	float4 max[ND_DIMS];
+} ND_BOX;
+
+/* Integer counterpart used for histogram cell iteration. */
+typedef struct ND_IBOX_T {
+	int min[ND_DIMS];
+	int max[ND_DIMS];
+} ND_IBOX;
+
+/* On-disk representation of the histogram emitted by ANALYZE. */
+typedef struct ND_STATS_T {
+	float4 ndims;
+	float4 size[ND_DIMS];
+	ND_BOX extent;
+	float4 table_features;
+	float4 sample_features;
+	float4 not_null_features;
+	float4 histogram_features;
+	float4 histogram_cells;
+	float4 cells_covered;
+	float4 value[1];
+} ND_STATS;
+
+/*
+ * Return the flattened index for the histogram coordinate expressed by
+ * 'indexes'.  A negative result signals that one of the axes fell outside
+ * the histogram definition.
+ */
+static inline int
+nd_stats_value_index(const ND_STATS *stats, const int *indexes)
+{
+	int d;
+	int accum = 1;
+	int vdx = 0;
+
+	for (d = 0; d < (int)(stats->ndims); d++)
+	{
+		int size = (int)(stats->size[d]);
+		if (indexes[d] < 0 || indexes[d] >= size)
+			return -1;
+		vdx += indexes[d] * accum;
+		accum *= size;
+	}
+	return vdx;
+}
+
+/*
+ * Derive the histogram grid budget requested by PostgreSQL's ANALYZE machinery.
+ * The planner caps the cell count via three heuristics that take the requested
+ * attstattarget, the histogram dimensionality, and the underlying row count
+ * into account.  Double precision arithmetic keeps the intermediate products in
+ * range so the cap behaves consistently across build architectures.
+ */
+static inline int
+histogram_cell_budget(double total_rows, int ndims, int attstattarget)
+{
+	double budget;
+	double dims_cap;
+	double rows_cap;
+	double attstat;
+	double dims;
+
+	if (ndims <= 0)
+		return 0;
+
+	if (attstattarget <= 0)
+		attstattarget = 1;
+
+	/* Requested resolution coming from PostgreSQL's ANALYZE knob. */
+	attstat = (double)attstattarget;
+	dims = (double)ndims;
+	budget = pow(attstat, dims);
+
+	/* Hard ceiling that keeps the statistics collector responsive. */
+	dims_cap = (double)ndims * 100000.0;
+	if (budget > dims_cap)
+		budget = dims_cap;
+
+	/* Small relations do not need a histogram that dwarfs the sample. */
+	if (total_rows <= 0.0)
+		return 0;
+
+	rows_cap = 10.0 * (double)ndims * total_rows;
+	if (rows_cap < 0.0)
+		rows_cap = 0.0;
+
+	/* Keep intermediate computations in double precision before clamping. */
+	if (rows_cap > (double)INT_MAX)
+		rows_cap = (double)INT_MAX;
+
+	if (budget > rows_cap)
+		budget = rows_cap;
+
+	if (budget >= (double)INT_MAX)
+		return INT_MAX;
+	if (budget <= 0.0)
+		return 0;
+
+	return (int)budget;
+}
+
+/*
+ * Compute the portion of 'target' covered by 'cover'.  The caller supplies the
+ * dimensionality because ND_BOX always carries four slots.  Degenerate volumes
+ * fold to zero, allowing the callers to detect slabs that ANALYZE sometimes
+ * emits for skewed datasets.
+ */
+static inline double
+nd_box_ratio(const ND_BOX *cover, const ND_BOX *target, int ndims)
+{
+	int d;
+	bool fully_covered = true;
+	double ivol = 1.0;
+	double refvol = 1.0;
+
+	for (d = 0; d < ndims; d++)
+	{
+		if (cover->max[d] <= target->min[d] || cover->min[d] >= target->max[d])
+			return 0.0; /* Disjoint */
+
+		if (cover->min[d] > target->min[d] || cover->max[d] < target->max[d])
+			fully_covered = false;
+	}
+
+	if (fully_covered)
+		return 1.0;
+
+	for (d = 0; d < ndims; d++)
+	{
+		double width = target->max[d] - target->min[d];
+		double imin = Max(cover->min[d], target->min[d]);
+		double imax = Min(cover->max[d], target->max[d]);
+		double iwidth = Max(0.0, imax - imin);
+
+		refvol *= width;
+		ivol *= iwidth;
+	}
+
+	if (refvol == 0.0)
+		return refvol;
+
+	return ivol / refvol;
+}
+
+#endif /* POSTGIS_GSERIALIZED_ESTIMATE_SUPPORT_H */

-----------------------------------------------------------------------

Summary of changes:
 NEWS                                   |   3 +-
 configure.ac                           |   1 +
 postgis/cunit/Makefile.in              |  43 ++++++
 postgis/cunit/cu_tester.c              | 173 ++++++++++++++++++++++++
 postgis/gserialized_estimate.c         | 192 +++-----------------------
 postgis/gserialized_estimate_support.h | 237 +++++++++++++++++++++++++++++++++
 6 files changed, 474 insertions(+), 175 deletions(-)
 create mode 100644 postgis/cunit/Makefile.in
 create mode 100644 postgis/cunit/cu_tester.c
 create mode 100644 postgis/gserialized_estimate_support.h


hooks/post-receive
-- 
PostGIS


More information about the postgis-tickets mailing list