[postgis-tickets] r16604 - Let KMeans init even if there are only duplicates in input
Darafei
komzpa at gmail.com
Sat Jun 2 05:09:35 PDT 2018
Author: komzpa
Date: 2018-06-02 05:09:35 -0700 (Sat, 02 Jun 2018)
New Revision: 16604
Modified:
trunk/liblwgeom/lwkmeans.c
trunk/regress/cluster.sql
trunk/regress/cluster_expected
Log:
Let KMeans init even if there are only duplicates in input
Added reporting of duplicates noticed on init pass.
Closes #4100
Closes https://github.com/postgis/postgis/pull/253
Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c 2018-06-02 05:39:23 UTC (rev 16603)
+++ trunk/liblwgeom/lwkmeans.c 2018-06-02 12:09:35 UTC (rev 16604)
@@ -131,6 +131,7 @@
double* distances;
uint32_t p1 = 0, p2 = 0;
uint32_t i, j;
+ uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
double max_dst = -1;
double dst_p1, dst_p2;
@@ -150,7 +151,7 @@
}
/* k >= 2: find two distant points greedily */
- for (i = 0; i < n; i++)
+ for (i = 1; i < n; i++)
{
/* skip null */
if (!objs[i]) continue;
@@ -174,7 +175,13 @@
else
p1 = i;
}
+ if ((dst_p1 == 0) || (dst_p2 == 0)) duplicate_count++;
}
+ if (duplicate_count > 1)
+ lwnotice(
+ "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
+ __func__,
+ duplicate_count);
/* by now two points should be found and non-same */
assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);
Modified: trunk/regress/cluster.sql
===================================================================
--- trunk/regress/cluster.sql 2018-06-02 05:39:23 UTC (rev 16603)
+++ trunk/regress/cluster.sql 2018-06-02 12:09:35 UTC (rev 16604)
@@ -36,8 +36,17 @@
SELECT 't103', id, ST_ClusterDBSCAN(geom, eps := 0.6, minpoints := 3) OVER () from dbscan_inputs;
-- #3612
-SELECT 't3612a', ST_ClusterDBSCAN(foo1.the_geom, 20.1, 5)OVER() As result
+SELECT '#3612a', ST_ClusterDBSCAN(foo1.the_geom, 20.1, 5)OVER() As result
FROM ((SELECT geom As the_geom
FROM (VALUES ( ST_GeomFromEWKT('SRID=4326;POLYGONM((-71.1319 42.2503 1,-71.132 42.2502 3,-71.1323 42.2504 -2,-71.1322 42.2505 1,-71.1319 42.2503 0))') ),
( ST_GeomFromEWKT('SRID=4326;POLYGONM((-71.1319 42.2512 0,-71.1318 42.2511 20,-71.1317 42.2511 -20,-71.1317 42.251 5,-71.1317 42.2509 4,-71.132 42.2511 6,-71.1319 42.2512 30))') ) ) As g(geom))) As foo1 LIMIT 3;
-SELECT 't3612b', ST_ClusterDBSCAN( ST_Point(1,1), 20.1, 5) OVER();
+SELECT '#3612b', ST_ClusterDBSCAN(ST_Point(1,1), 20.1, 5) OVER();
+
+
+-- ST_ClusterKMeans
+select '#4100a', count(distinct result) from (SELECT ST_ClusterKMeans(foo1.the_geom, 3)OVER() As result
+ FROM ((SELECT ST_Collect(geom) As the_geom
+ FROM (VALUES ( ST_GeomFromEWKT('SRID=4326;MULTIPOLYGON(((-71.0821 42.3036 2,-71.0822 42.3036 2,-71.082 42.3038 2,-71.0819 42.3037 2,-71.0821 42.3036 2)))') ),
+ ( ST_GeomFromEWKT('SRID=4326;POLYGON((-71.1261 42.2703 1,-71.1257 42.2703 1,-71.1257 42.2701 1,-71.126 42.2701 1,-71.1261 42.2702 1,-71.1261 42.2703 1))') ) ) As g(geom) CROSS JOIN generate_series(1,3) As i GROUP BY i )) As foo1 LIMIT 10) kmeans;
+
+select '#4100b', count(distinct cid) from (select ST_ClusterKMeans(geom,2) over () as cid from (values ('POINT(0 0)'::geometry), ('POINT(0 0)')) g(geom)) kmeans;
Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected 2018-06-02 05:39:23 UTC (rev 16603)
+++ trunk/regress/cluster_expected 2018-06-02 12:09:35 UTC (rev 16604)
@@ -27,6 +27,10 @@
t103|4|0
t103|5|0
t103|6|0
-t3612a|
-t3612a|
-t3612b|
+#3612a|
+#3612a|
+#3612b|
+NOTICE: kmeans_init: there are at least 3 duplicate inputs, number of output clusters may be less than you requested
+#4100a|1
+NOTICE: kmeans_init: there are at least 2 duplicate inputs, number of output clusters may be less than you requested
+#4100b|1
More information about the postgis-tickets mailing list