[postgis-tickets] r16605 - ST_ClusterKMeans: handle effective K=0 when all the inputs are EMPTY

Darafei komzpa at gmail.com
Sun Jun 3 02:11:32 PDT 2018


Author: komzpa
Date: 2018-06-03 02:11:32 -0700 (Sun, 03 Jun 2018)
New Revision: 16605

Modified:
   trunk/liblwgeom/lwkmeans.c
   trunk/regress/cluster.sql
   trunk/regress/cluster_expected
Log:
ST_ClusterKMeans: handle effective K=0 when all the inputs are EMPTY

Closes #4101
Closes https://github.com/postgis/postgis/pull/254



Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c	2018-06-02 12:09:35 UTC (rev 16604)
+++ trunk/liblwgeom/lwkmeans.c	2018-06-03 09:11:32 UTC (rev 16605)
@@ -135,21 +135,9 @@
 	double max_dst = -1;
 	double dst_p1, dst_p2;
 
-	assert(k > 0);
+	/* k=0, k=1: "clustering" is just input validation */
+	assert(k > 1);
 
-	/* k = 1: first non-null is ok, and input check guarantees there's one */
-	if (k == 1)
-	{
-		for (i = 0; i < n; i++)
-		{
-			if (!objs[i]) continue;
-			centers_raw[0] = *((POINT2D *)objs[i]);
-			centers[0] = &(centers_raw[0]);
-			return;
-		}
-		assert(0);
-	}
-
 	/* k >= 2: find two distant points greedily */
 	for (i = 1; i < n; i++)
 	{
@@ -333,10 +321,25 @@
 		k = num_non_empty;
 	}
 
-	kmeans_init(objs, clusters, n, centers, centers_raw, k);
+	if (k > 1)
+	{
+		kmeans_init(objs, clusters, n, centers, centers_raw, k);
+		result = kmeans(objs, clusters, n, centers, k);
+	}
+	else
+	{
+		/* k=0: everything is unclusterable
+		 * k=1: mark up NULL and non-NULL */
+		for (i = 0; i < n; i++)
+		{
+			if (k == 0 || !objs[i])
+				clusters[i] = KMEANS_NULL_CLUSTER;
+			else
+				clusters[i] = 0;
+		}
+		result = LW_TRUE;
+	}
 
-	result = kmeans(objs, clusters, n, centers, k);
-
 	/* Before error handling, might as well clean up all the inputs */
 	lwfree(objs);
 	lwfree(centers);

Modified: trunk/regress/cluster.sql
===================================================================
--- trunk/regress/cluster.sql	2018-06-02 12:09:35 UTC (rev 16604)
+++ trunk/regress/cluster.sql	2018-06-03 09:11:32 UTC (rev 16605)
@@ -50,3 +50,13 @@
 	( ST_GeomFromEWKT('SRID=4326;POLYGON((-71.1261 42.2703 1,-71.1257 42.2703 1,-71.1257 42.2701 1,-71.126 42.2701 1,-71.1261 42.2702 1,-71.1261 42.2703 1))') ) ) As g(geom) CROSS JOIN generate_series(1,3) As i GROUP BY i )) As foo1 LIMIT 10) kmeans;
 
 select '#4100b', count(distinct cid) from (select ST_ClusterKMeans(geom,2) over () as cid from (values ('POINT(0 0)'::geometry), ('POINT(0 0)')) g(geom)) kmeans;
+
+
+select '#4101a', count(distinct result) from (SELECT ST_ClusterKMeans(foo1.the_geom, 3) OVER()  As result
+							FROM ((SELECT ST_GeomFromText('POINT EMPTY',4326) As the_geom
+			UNION ALL SELECT ST_GeomFromText('MULTIPOINT EMPTY',4326) As the_geom
+			UNION ALL SELECT ST_GeomFromText('MULTIPOLYGON EMPTY',4326) As the_geom
+			UNION ALL SELECT ST_GeomFromText('LINESTRING EMPTY',4326) As the_geom
+			UNION ALL SELECT ST_GeomFromText('MULTILINESTRING EMPTY',4326) As the_geom ) ) As foo1 LIMIT 10) kmeans;
+
+select '#4101b', count(distinct cid) from (select ST_ClusterKMeans(geom,2) over () as cid from (values ('POINT EMPTY'::geometry), ('POINT EMPTY')) g(geom)) kmeans;

Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected	2018-06-02 12:09:35 UTC (rev 16604)
+++ trunk/regress/cluster_expected	2018-06-03 09:11:32 UTC (rev 16605)
@@ -34,3 +34,7 @@
 #4100a|1
 NOTICE:  kmeans_init: there are at least 2 duplicate inputs, number of output clusters may be less than you requested
 #4100b|1
+NOTICE:  lwgeom_cluster_2d_kmeans: number of non-empty geometries is less than the number of clusters requested, not all clusters will get data
+#4101a|1
+NOTICE:  lwgeom_cluster_2d_kmeans: number of non-empty geometries is less than the number of clusters requested, not all clusters will get data
+#4101b|1



More information about the postgis-tickets mailing list