[postgis-tickets] r16548 - ST_ClusterKMeans k=2 is now faster and converges better.

Darafei komzpa at gmail.com
Sat Apr 21 05:36:51 PDT 2018


Author: komzpa
Date: 2018-04-21 05:36:51 -0700 (Sat, 21 Apr 2018)
New Revision: 16548

Modified:
   trunk/NEWS
   trunk/liblwgeom/lwkmeans.c
   trunk/liblwgeom/measures.c
   trunk/regress/lwgeom_regress.sql
   trunk/regress/lwgeom_regress_expected
   trunk/regress/tickets.sql
   trunk/regress/tickets_expected
Log:
ST_ClusterKMeans k=2 is now faster and converges better.

This discovered and fixes issue in NULL handling.

Closes #4071
Closes https://github.com/postgis/postgis/pull/238



Modified: trunk/NEWS
===================================================================
--- trunk/NEWS	2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/NEWS	2018-04-21 12:36:51 UTC (rev 16548)
@@ -50,7 +50,7 @@
            PolyhedralSurface (Matthias Bay)
   - #2508, ST_OffsetCurve now works with collections (Darafei Praliaskouski)
   - #4006, ST_GeomFromGeoJSON support for json and jsonb as input
-          (Paul Ramsey, Regina Obe)
+           (Paul Ramsey, Regina Obe)
   - #4037, Invalid input geometry is fixed with MakeValid for GEOS exceptions in
            ST_Intersection, ST_Union, ST_Difference, ST_SymDifference (Darafei
            Praliaskouski)
@@ -59,6 +59,7 @@
            robustness issues. (Darafei Praliaskouski)
   - #4025, #4032 Fixed precision issue in ST_ClosestPointOfApproach,
            ST_DistanceCPA, and ST_CPAWithin (Paul Ramsey, Darafei Praliaskouski)
+  - #4071, ST_ClusterKMeans crash on NULL/EMPTY fixed (Darafei Praliaskouski)
 
 
 PostGIS 2.4.0

Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c	2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/liblwgeom/lwkmeans.c	2018-04-21 12:36:51 UTC (rev 16548)
@@ -11,7 +11,7 @@
 #include "liblwgeom_internal.h"
 
 /*
- * When clustering lists with NULL elements, they will get this as
+ * When clustering lists with NULL or EMPTY elements, they will get this as
  * their cluster number. (All the other clusters will be non-negative)
  */
 #define KMEANS_NULL_CLUSTER -1
@@ -65,8 +65,9 @@
 update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t* weights, uint32_t k)
 {
 	uint32_t i;
+	int cluster;
 
-	memset(weights, 0, sizeof(int) * k);
+	memset(weights, 0, sizeof(uint32_t) * k);
 	for (i = 0; i < k; i++)
 	{
 		centers[i]->x = 0.0;
@@ -74,9 +75,13 @@
 	}
 	for (i = 0; i < n; i++)
 	{
-		centers[clusters[i]]->x += objs[i]->x;
-		centers[clusters[i]]->y += objs[i]->y;
-		weights[clusters[i]] += 1;
+		cluster = clusters[i];
+		if (cluster != KMEANS_NULL_CLUSTER)
+		{
+			centers[cluster]->x += objs[i]->x;
+			centers[cluster]->y += objs[i]->y;
+			weights[cluster] += 1;
+		}
 	}
 	for (i = 0; i < k; i++)
 	{
@@ -96,10 +101,7 @@
 
 	weights = lwalloc(sizeof(int) * k);
 
-	/*
-	 * Previous cluster state array. At this time, r doesn't mean anything
-	 * but it's ok
-	 */
+	/* previous cluster state array */
 	clusters_last = lwalloc(clusters_sz);
 
 	for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
@@ -106,7 +108,7 @@
 	{
 		LW_ON_INTERRUPT(break);
 
-		/* Store the previous state of the clustering */
+		/* store the previous state of the clustering */
 		memcpy(clusters_last, clusters, clusters_sz);
 
 		update_r(objs, clusters, n, centers, k);
@@ -123,22 +125,132 @@
 	return converged;
 }
 
+static void
+kmeans_init(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, POINT2D* centers_raw, uint32_t k)
+{
+	double* distances;
+	uint32_t p1 = 0, p2 = 0;
+	uint32_t i, j;
+	double max_dst = -1;
+	double dst_p1, dst_p2;
+
+	assert(k > 0);
+
+	/* k = 1: first non-null is ok, and input check guarantees there's one */
+	if (k == 1)
+	{
+		for (i = 0; i < n; i++)
+		{
+			if (!objs[i]) continue;
+			centers_raw[0] = *((POINT2D *)objs[i]);
+			centers[0] = &(centers_raw[0]);
+			return;
+		}
+		assert(0);
+	}
+
+	/* k >= 2: find two distant points greedily */
+	for (i = 0; i < n; i++)
+	{
+		/* skip null */
+		if (!objs[i]) continue;
+
+		/* reinit if first element happened to be null */
+		if (!objs[p1] && !objs[p2])
+		{
+			p1 = i;
+			p2 = i;
+			continue;
+		}
+
+		/* if we found a larger distance, replace our choice */
+		dst_p1 = distance2d_sqr_pt_pt(objs[i], objs[p1]);
+		dst_p2 = distance2d_sqr_pt_pt(objs[i], objs[p2]);
+		if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
+		{
+			max_dst = fmax(dst_p1, dst_p2);
+			if (dst_p1 > dst_p2)
+				p2 = i;
+			else
+				p1 = i;
+		}
+	}
+
+	/* by now two points should be found and non-same */
+	assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);
+
+	/* accept these two points */
+	centers_raw[0] = *((POINT2D *)objs[p1]);
+	centers[0] = &(centers_raw[0]);
+	centers_raw[1] = *((POINT2D *)objs[p2]);
+	centers[1] = &(centers_raw[1]);
+
+	if (k > 2)
+	{
+		/* array of minimum distance to a point from accepted cluster centers */
+		distances = lwalloc(sizeof(double) * n);
+
+		/* initialize array with distance to first object */
+		for (j = 0; j < n; j++)
+		{
+			if (objs[j])
+				distances[j] = distance2d_sqr_pt_pt(objs[j], centers[0]);
+			else
+				distances[j] = -1;
+		}
+		distances[p1] = -1;
+		distances[p2] = -1;
+
+		/* loop i on clusters, skip 0 and 1 as found already */
+		for (i = 2; i < k; i++)
+		{
+			uint32_t candidate_center = 0;
+			double max_distance = -DBL_MAX;
+
+			/* loop j on objs */
+			for (j = 0; j < n; j++)
+			{
+				/* empty objs and accepted clusters are already marked with distance = -1 */
+				if (distances[j] < 0) continue;
+
+				/* update minimal distance with previosuly accepted cluster */
+				distances[j] = fmin(distance2d_sqr_pt_pt(objs[j], centers[i - 1]), distances[j]);
+
+				/* greedily take a point that's farthest from any of accepted clusters */
+				if (distances[j] > max_distance)
+				{
+					candidate_center = j;
+					max_distance = distances[j];
+				}
+			}
+
+			/* Checked earlier by counting entries on input, just in case */
+			assert(max_distance >= 0);
+
+			/* accept candidate to centers */
+			distances[candidate_center] = -1;
+			/* Copy the point coordinates into the initial centers array
+			 * Centers array is an array of pointers to points, not an array of points */
+			centers_raw[i] = *((POINT2D *)objs[candidate_center]);
+			centers[i] = &(centers_raw[i]);
+		}
+		lwfree(distances);
+	}
+}
+
 int*
 lwgeom_cluster_2d_kmeans(const LWGEOM** geoms, uint32_t n, uint32_t k)
 {
 	uint32_t i;
 	uint32_t num_centroids = 0;
+	uint32_t num_non_empty = 0;
 	LWGEOM** centroids;
 	POINT2D* centers_raw;
-	double* distances;
 	const POINT2D* cp;
 	int result = LW_FALSE;
-	uint32_t boundary_point_idx = 0;
-	double max_norm = -DBL_MAX;
-	double curr_norm;
 
 	/* An array of objects to be analyzed.
-	 * All NULL values will be returned in the  KMEANS_NULL_CLUSTER. */
+	 * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
 	POINT2D** objs;
 
 	/* An array of centers for the algorithm. */
@@ -152,7 +264,10 @@
 	assert(geoms);
 
 	if (n < k)
-		lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
+	{
+		lwerror("%s: number of geometries is less than the number of clusters requested, not all clusters will get data", __func__);
+		k = n;
+	}
 
 	/* We'll hold the temporary centroid objects here */
 	centroids = lwalloc(sizeof(LWGEOM*) * n);
@@ -173,9 +288,6 @@
 	memset(clusters, 0, sizeof(int) * n);
 	memset(centers, 0, sizeof(POINT2D*) * k);
 
-	/* Array of sums of distances to a point from accepted cluster centers */
-	distances = lwalloc(sizeof(double) * n);
-
 	/* Prepare the list of object pointers for K-means */
 	for (i = 0; i < n; i++)
 	{
@@ -182,25 +294,18 @@
 		const LWGEOM* geom = geoms[i];
 		LWPOINT* lwpoint;
 
-		/* Null/empty geometries get a NULL pointer */
-		if ((!geom) || lwgeom_is_empty(geom))
-		{
-			objs[i] = NULL;
-			/* mark as unreachable */
-			distances[i] = -1;
-			continue;
-		}
+		/* Null/empty geometries get a NULL pointer, set earlier with memset */
+		if ((!geom) || lwgeom_is_empty(geom)) continue;
 
-		distances[i] = DBL_MAX;
-
 		/* If the input is a point, use its coordinates */
 		/* If its not a point, convert it to one via centroid */
 		if (lwgeom_get_type(geom) != POINTTYPE)
 		{
 			LWGEOM* centroid = lwgeom_centroid(geom);
-			if ((!centroid) || lwgeom_is_empty(centroid))
+			if ((!centroid)) continue;
+			if (lwgeom_is_empty(centroid))
 			{
-				objs[i] = NULL;
+				lwgeom_free(centroid);
 				continue;
 			}
 			centroids[num_centroids++] = centroid;
@@ -207,71 +312,22 @@
 			lwpoint = lwgeom_as_lwpoint(centroid);
 		}
 		else
-		{
 			lwpoint = lwgeom_as_lwpoint(geom);
-		}
 
 		/* Store a pointer to the POINT2D we are interested in */
 		cp = getPoint2d_cp(lwpoint->point, 0);
 		objs[i] = (POINT2D*)cp;
-
-		/* Find the point with largest Euclidean norm to use as seed */
-		curr_norm = (cp->x) * (cp->x) + (cp->y) * (cp->y);
-		if (curr_norm > max_norm)
-		{
-			boundary_point_idx = i;
-			max_norm = curr_norm;
-		}
+		num_non_empty++;
 	}
 
-	if (max_norm == -DBL_MAX)
+	if (num_non_empty < k)
 	{
-		lwerror("unable to calculate any cluster seed point, too many NULLs or empties?");
+		lwnotice("%s: number of non-empty geometries is less than the number of clusters requested, not all clusters will get data", __func__);
+		k = num_non_empty;
 	}
 
-	/* start with point on boundary */
-	distances[boundary_point_idx] = -1;
-	centers_raw[0] = *((POINT2D*)objs[boundary_point_idx]);
-	centers[0] = &(centers_raw[0]);
-	/* loop i on clusters, skip 0 as it's found already */
-	for (i = 1; i < k; i++)
-	{
-		uint32_t j;
-		double max_distance = -DBL_MAX;
-		double curr_distance;
-		uint32_t candidate_center = 0;
+	kmeans_init(objs, clusters, n, centers, centers_raw, k);
 
-		/* loop j on objs */
-		for (j = 0; j < n; j++)
-		{
-			/* empty objs and accepted clusters are already marked with distance = -1 */
-			if (distances[j] < 0)
-				continue;
-
-			/* update distance to closest cluster */
-			curr_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
-			distances[j] = fmin(curr_distance, distances[j]);
-
-			/* greedily take a point that's farthest from any of accepted clusters */
-			if (distances[j] > max_distance)
-			{
-				candidate_center = j;
-				max_distance = distances[j];
-			}
-		}
-
-		/* something is wrong with data, cannot find a candidate */
-		if (max_distance < 0)
-			lwerror("unable to calculate cluster seed points, too many NULLs or empties?");
-
-		/* accept candidtate to centers */
-		distances[candidate_center] = -1;
-		/* Copy the point coordinates into the initial centers array
-		 * This is ugly, but the centers array is an array of pointers to points, not an array of points */
-		centers_raw[i] = *((POINT2D*)objs[candidate_center]);
-		centers[i] = &(centers_raw[i]);
-	}
-
 	result = kmeans(objs, clusters, n, centers, k);
 
 	/* Before error handling, might as well clean up all the inputs */
@@ -279,13 +335,11 @@
 	lwfree(centers);
 	lwfree(centers_raw);
 	lwfree(centroids);
-	lwfree(distances);
 
 	/* Good result */
-	if (result)
-		return clusters;
+	if (result) return clusters;
 
 	/* Bad result, not going to need the answer */
 	lwfree(clusters);
 	return NULL;
-}
\ No newline at end of file
+}

Modified: trunk/liblwgeom/measures.c
===================================================================
--- trunk/liblwgeom/measures.c	2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/liblwgeom/measures.c	2018-04-21 12:36:51 UTC (rev 16548)
@@ -2308,35 +2308,24 @@
 End of Functions in common for Brute force and new calculation
 --------------------------------------------------------------------------------------------------------------*/
 
-
-/**
-The old function necessary for ptarray_segmentize2d in ptarray.c
-*/
-double
+inline double
 distance2d_pt_pt(const POINT2D *p1, const POINT2D *p2)
 {
 	double hside = p2->x - p1->x;
 	double vside = p2->y - p1->y;
 
-	return sqrt ( hside*hside + vside*vside );
-
+	return hypot(hside, vside);
 }
 
-double
+inline double
 distance2d_sqr_pt_pt(const POINT2D *p1, const POINT2D *p2)
 {
 	double hside = p2->x - p1->x;
 	double vside = p2->y - p1->y;
 
-	return  hside*hside + vside*vside;
-
+	return hside * hside + vside * vside;
 }
 
-
-/**
-
-The old function necessary for ptarray_segmentize2d in ptarray.c
-*/
 double
 distance2d_pt_seg(const POINT2D *p, const POINT2D *A, const POINT2D *B)
 {

Modified: trunk/regress/lwgeom_regress.sql
===================================================================
--- trunk/regress/lwgeom_regress.sql	2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/lwgeom_regress.sql	2018-04-21 12:36:51 UTC (rev 16548)
@@ -201,6 +201,14 @@
 order by count(*)
 limit 1;
 
+-- check that null and empty is handled in the clustering
+select '#4071', count(distinct a), count(distinct b), count(distinct c)  from
+(select
+	ST_ClusterKMeans(geom, 1) over () a,
+	ST_ClusterKMeans(geom, 2) over () b,
+	ST_ClusterKMeans(geom, 3) over () c
+from (values (null::geometry), ('POINT(1 1)'), ('POINT EMPTY'), ('POINT(0 0)'), ('POINT(4 4)')) as g (geom)) z;
+
 -- typmod checks
 select 'typmod_point_4326', geometry_typmod_out(geometry_typmod_in('{Point,4326}'));
 select 'typmod_point_0', geometry_typmod_out(geometry_typmod_in('{Point,0}'));

Modified: trunk/regress/lwgeom_regress_expected
===================================================================
--- trunk/regress/lwgeom_regress_expected	2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/lwgeom_regress_expected	2018-04-21 12:36:51 UTC (rev 16548)
@@ -34,6 +34,7 @@
 ST_Angle_2_lines|4.71238898038469
 #3965|25|25
 #3971|t
+#4071|2|3|4
 typmod_point_4326|(Point,4326)
 typmod_point_0|(Point)
 NOTICE:  SRID value -1 converted to the officially unknown SRID value 0

Modified: trunk/regress/tickets.sql
===================================================================
--- trunk/regress/tickets.sql	2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/tickets.sql	2018-04-21 12:36:51 UTC (rev 16548)
@@ -878,7 +878,7 @@
 WITH
   input AS (SELECT 'SRID=4326;POLYGON((26426 65078,26531 65242,26075 65136,26096 65427,26426 65078))'::geometry AS geom),
   mbc   AS (SELECT (mb).center, (mb).radius FROM (SELECT ST_MinimumBoundingRadius(geom) mb FROM input) sq)
-SELECT '#2996', radius = ST_Length(ST_LongestLine(geom, center)) FROM input, mbc;
+SELECT '#2996', radius::numeric(8,2), ST_Length(ST_LongestLine(geom, center))::numeric(8,2) FROM input, mbc;
 
 -- #3119 --
 SELECT '#3119a', floor(ST_LengthSpheroid('SRID=4326;LINESTRING (-72.640965 42.11867, -72.6395 42.1187)', 'SPHEROID["GRS_1980",6378137,298.257222101]'));

Modified: trunk/regress/tickets_expected
===================================================================
--- trunk/regress/tickets_expected	2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/tickets_expected	2018-04-21 12:36:51 UTC (rev 16548)
@@ -270,7 +270,7 @@
 #2870|Point[GS]
 #2956|t
 #2985|LINESTRING(20.9511664253809 52.3984560730436)
-#2996|t
+#2996|247.44|247.44
 #3119a|121
 #3119b|291
 #3119c|615



More information about the postgis-tickets mailing list