[postgis-tickets] r16604 - Let KMeans init even if there are only duplicates in input

Darafei komzpa at gmail.com
Sat Jun 2 05:09:35 PDT 2018


Author: komzpa
Date: 2018-06-02 05:09:35 -0700 (Sat, 02 Jun 2018)
New Revision: 16604

Modified:
   trunk/liblwgeom/lwkmeans.c
   trunk/regress/cluster.sql
   trunk/regress/cluster_expected
Log:
Let KMeans init even if there are only duplicates in input

Added a NOTICE reporting the number of duplicate inputs detected during the init pass.

Closes #4100
Closes https://github.com/postgis/postgis/pull/253



Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c	2018-06-02 05:39:23 UTC (rev 16603)
+++ trunk/liblwgeom/lwkmeans.c	2018-06-02 12:09:35 UTC (rev 16604)
@@ -131,6 +131,7 @@
 	double* distances;
 	uint32_t p1 = 0, p2 = 0;
 	uint32_t i, j;
+	uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
 	double max_dst = -1;
 	double dst_p1, dst_p2;
 
@@ -150,7 +151,7 @@
 	}
 
 	/* k >= 2: find two distant points greedily */
-	for (i = 0; i < n; i++)
+	for (i = 1; i < n; i++)
 	{
 		/* skip null */
 		if (!objs[i]) continue;
@@ -174,7 +175,13 @@
 			else
 				p1 = i;
 		}
+		if ((dst_p1 == 0) || (dst_p2 == 0)) duplicate_count++;
 	}
+	if (duplicate_count > 1)
+		lwnotice(
+		    "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
+		    __func__,
+		    duplicate_count);
 
 	/* by now two points should be found and non-same */
 	assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);

Modified: trunk/regress/cluster.sql
===================================================================
--- trunk/regress/cluster.sql	2018-06-02 05:39:23 UTC (rev 16603)
+++ trunk/regress/cluster.sql	2018-06-02 12:09:35 UTC (rev 16604)
@@ -36,8 +36,17 @@
 SELECT 't103', id, ST_ClusterDBSCAN(geom, eps := 0.6, minpoints := 3) OVER () from dbscan_inputs;
 
 -- #3612
-SELECT 't3612a', ST_ClusterDBSCAN(foo1.the_geom, 20.1, 5)OVER()  As result
+SELECT '#3612a', ST_ClusterDBSCAN(foo1.the_geom, 20.1, 5)OVER()  As result
 							FROM ((SELECT geom  As the_geom
 									FROM (VALUES ( ST_GeomFromEWKT('SRID=4326;POLYGONM((-71.1319 42.2503 1,-71.132 42.2502 3,-71.1323 42.2504 -2,-71.1322 42.2505 1,-71.1319 42.2503 0))') ),
 											( ST_GeomFromEWKT('SRID=4326;POLYGONM((-71.1319 42.2512 0,-71.1318 42.2511 20,-71.1317 42.2511 -20,-71.1317 42.251 5,-71.1317 42.2509 4,-71.132 42.2511 6,-71.1319 42.2512 30))') ) ) As g(geom))) As foo1 LIMIT 3;
-SELECT 't3612b', ST_ClusterDBSCAN( ST_Point(1,1), 20.1, 5) OVER();
+SELECT '#3612b', ST_ClusterDBSCAN(ST_Point(1,1), 20.1, 5) OVER();
+
+
+-- ST_ClusterKMeans
+select '#4100a', count(distinct result) from (SELECT ST_ClusterKMeans(foo1.the_geom, 3)OVER()  As result
+  FROM ((SELECT ST_Collect(geom)  As the_geom
+		FROM (VALUES ( ST_GeomFromEWKT('SRID=4326;MULTIPOLYGON(((-71.0821 42.3036 2,-71.0822 42.3036 2,-71.082 42.3038 2,-71.0819 42.3037 2,-71.0821 42.3036 2)))') ),
+	( ST_GeomFromEWKT('SRID=4326;POLYGON((-71.1261 42.2703 1,-71.1257 42.2703 1,-71.1257 42.2701 1,-71.126 42.2701 1,-71.1261 42.2702 1,-71.1261 42.2703 1))') ) ) As g(geom) CROSS JOIN generate_series(1,3) As i GROUP BY i )) As foo1 LIMIT 10) kmeans;
+
+select '#4100b', count(distinct cid) from (select ST_ClusterKMeans(geom,2) over () as cid from (values ('POINT(0 0)'::geometry), ('POINT(0 0)')) g(geom)) kmeans;

Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected	2018-06-02 05:39:23 UTC (rev 16603)
+++ trunk/regress/cluster_expected	2018-06-02 12:09:35 UTC (rev 16604)
@@ -27,6 +27,10 @@
 t103|4|0
 t103|5|0
 t103|6|0
-t3612a|
-t3612a|
-t3612b|
+#3612a|
+#3612a|
+#3612b|
+NOTICE:  kmeans_init: there are at least 3 duplicate inputs, number of output clusters may be less than you requested
+#4100a|1
+NOTICE:  kmeans_init: there are at least 2 duplicate inputs, number of output clusters may be less than you requested
+#4100b|1



More information about the postgis-tickets mailing list