[postgis-tickets] r16548 - ST_ClusterKMeans k=2 is now faster and converges better.
Darafei
komzpa at gmail.com
Sat Apr 21 05:36:51 PDT 2018
Author: komzpa
Date: 2018-04-21 05:36:51 -0700 (Sat, 21 Apr 2018)
New Revision: 16548
Modified:
trunk/NEWS
trunk/liblwgeom/lwkmeans.c
trunk/liblwgeom/measures.c
trunk/regress/lwgeom_regress.sql
trunk/regress/lwgeom_regress_expected
trunk/regress/tickets.sql
trunk/regress/tickets_expected
Log:
ST_ClusterKMeans k=2 is now faster and converges better.
This discovered and fixes issue in NULL handling.
Closes #4071
Closes https://github.com/postgis/postgis/pull/238
Modified: trunk/NEWS
===================================================================
--- trunk/NEWS 2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/NEWS 2018-04-21 12:36:51 UTC (rev 16548)
@@ -50,7 +50,7 @@
PolyhedralSurface (Matthias Bay)
- #2508, ST_OffsetCurve now works with collections (Darafei Praliaskouski)
- #4006, ST_GeomFromGeoJSON support for json and jsonb as input
- (Paul Ramsey, Regina Obe)
+ (Paul Ramsey, Regina Obe)
- #4037, Invalid input geometry is fixed with MakeValid for GEOS exceptions in
ST_Intersection, ST_Union, ST_Difference, ST_SymDifference (Darafei
Praliaskouski)
@@ -59,6 +59,7 @@
robustness issues. (Darafei Praliaskouski)
- #4025, #4032 Fixed precision issue in ST_ClosestPointOfApproach,
ST_DistanceCPA, and ST_CPAWithin (Paul Ramsey, Darafei Praliaskouski)
+ - #4071, ST_ClusterKMeans crash on NULL/EMPTY fixed (Darafei Praliaskouski)
PostGIS 2.4.0
Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c 2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/liblwgeom/lwkmeans.c 2018-04-21 12:36:51 UTC (rev 16548)
@@ -11,7 +11,7 @@
#include "liblwgeom_internal.h"
/*
- * When clustering lists with NULL elements, they will get this as
+ * When clustering lists with NULL or EMPTY elements, they will get this as
* their cluster number. (All the other clusters will be non-negative)
*/
#define KMEANS_NULL_CLUSTER -1
@@ -65,8 +65,9 @@
update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t* weights, uint32_t k)
{
uint32_t i;
+ int cluster;
- memset(weights, 0, sizeof(int) * k);
+ memset(weights, 0, sizeof(uint32_t) * k);
for (i = 0; i < k; i++)
{
centers[i]->x = 0.0;
@@ -74,9 +75,13 @@
}
for (i = 0; i < n; i++)
{
- centers[clusters[i]]->x += objs[i]->x;
- centers[clusters[i]]->y += objs[i]->y;
- weights[clusters[i]] += 1;
+ cluster = clusters[i];
+ if (cluster != KMEANS_NULL_CLUSTER)
+ {
+ centers[cluster]->x += objs[i]->x;
+ centers[cluster]->y += objs[i]->y;
+ weights[cluster] += 1;
+ }
}
for (i = 0; i < k; i++)
{
@@ -96,10 +101,7 @@
weights = lwalloc(sizeof(int) * k);
- /*
- * Previous cluster state array. At this time, r doesn't mean anything
- * but it's ok
- */
+ /* previous cluster state array */
clusters_last = lwalloc(clusters_sz);
for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
@@ -106,7 +108,7 @@
{
LW_ON_INTERRUPT(break);
- /* Store the previous state of the clustering */
+ /* store the previous state of the clustering */
memcpy(clusters_last, clusters, clusters_sz);
update_r(objs, clusters, n, centers, k);
@@ -123,22 +125,132 @@
return converged;
}
+static void
+kmeans_init(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, POINT2D* centers_raw, uint32_t k)
+{
+ double* distances;
+ uint32_t p1 = 0, p2 = 0;
+ uint32_t i, j;
+ double max_dst = -1;
+ double dst_p1, dst_p2;
+
+ assert(k > 0);
+
+ /* k = 1: first non-null is ok, and input check guarantees there's one */
+ if (k == 1)
+ {
+ for (i = 0; i < n; i++)
+ {
+ if (!objs[i]) continue;
+ centers_raw[0] = *((POINT2D *)objs[i]);
+ centers[0] = &(centers_raw[0]);
+ return;
+ }
+ assert(0);
+ }
+
+ /* k >= 2: find two distant points greedily */
+ for (i = 0; i < n; i++)
+ {
+ /* skip null */
+ if (!objs[i]) continue;
+
+ /* reinit if first element happened to be null */
+ if (!objs[p1] && !objs[p2])
+ {
+ p1 = i;
+ p2 = i;
+ continue;
+ }
+
+ /* if we found a larger distance, replace our choice */
+ dst_p1 = distance2d_sqr_pt_pt(objs[i], objs[p1]);
+ dst_p2 = distance2d_sqr_pt_pt(objs[i], objs[p2]);
+ if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
+ {
+ max_dst = fmax(dst_p1, dst_p2);
+ if (dst_p1 > dst_p2)
+ p2 = i;
+ else
+ p1 = i;
+ }
+ }
+
+ /* by now two points should be found and non-same */
+ assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);
+
+ /* accept these two points */
+ centers_raw[0] = *((POINT2D *)objs[p1]);
+ centers[0] = &(centers_raw[0]);
+ centers_raw[1] = *((POINT2D *)objs[p2]);
+ centers[1] = &(centers_raw[1]);
+
+ if (k > 2)
+ {
+ /* array of minimum distance to a point from accepted cluster centers */
+ distances = lwalloc(sizeof(double) * n);
+
+ /* initialize array with distance to first object */
+ for (j = 0; j < n; j++)
+ {
+ if (objs[j])
+ distances[j] = distance2d_sqr_pt_pt(objs[j], centers[0]);
+ else
+ distances[j] = -1;
+ }
+ distances[p1] = -1;
+ distances[p2] = -1;
+
+ /* loop i on clusters, skip 0 and 1 as found already */
+ for (i = 2; i < k; i++)
+ {
+ uint32_t candidate_center = 0;
+ double max_distance = -DBL_MAX;
+
+ /* loop j on objs */
+ for (j = 0; j < n; j++)
+ {
+ /* empty objs and accepted clusters are already marked with distance = -1 */
+ if (distances[j] < 0) continue;
+
+ /* update minimal distance with previosuly accepted cluster */
+ distances[j] = fmin(distance2d_sqr_pt_pt(objs[j], centers[i - 1]), distances[j]);
+
+ /* greedily take a point that's farthest from any of accepted clusters */
+ if (distances[j] > max_distance)
+ {
+ candidate_center = j;
+ max_distance = distances[j];
+ }
+ }
+
+ /* Checked earlier by counting entries on input, just in case */
+ assert(max_distance >= 0);
+
+ /* accept candidate to centers */
+ distances[candidate_center] = -1;
+ /* Copy the point coordinates into the initial centers array
+ * Centers array is an array of pointers to points, not an array of points */
+ centers_raw[i] = *((POINT2D *)objs[candidate_center]);
+ centers[i] = &(centers_raw[i]);
+ }
+ lwfree(distances);
+ }
+}
+
int*
lwgeom_cluster_2d_kmeans(const LWGEOM** geoms, uint32_t n, uint32_t k)
{
uint32_t i;
uint32_t num_centroids = 0;
+ uint32_t num_non_empty = 0;
LWGEOM** centroids;
POINT2D* centers_raw;
- double* distances;
const POINT2D* cp;
int result = LW_FALSE;
- uint32_t boundary_point_idx = 0;
- double max_norm = -DBL_MAX;
- double curr_norm;
/* An array of objects to be analyzed.
- * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
+ * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
POINT2D** objs;
/* An array of centers for the algorithm. */
@@ -152,7 +264,10 @@
assert(geoms);
if (n < k)
- lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
+ {
+ lwerror("%s: number of geometries is less than the number of clusters requested, not all clusters will get data", __func__);
+ k = n;
+ }
/* We'll hold the temporary centroid objects here */
centroids = lwalloc(sizeof(LWGEOM*) * n);
@@ -173,9 +288,6 @@
memset(clusters, 0, sizeof(int) * n);
memset(centers, 0, sizeof(POINT2D*) * k);
- /* Array of sums of distances to a point from accepted cluster centers */
- distances = lwalloc(sizeof(double) * n);
-
/* Prepare the list of object pointers for K-means */
for (i = 0; i < n; i++)
{
@@ -182,25 +294,18 @@
const LWGEOM* geom = geoms[i];
LWPOINT* lwpoint;
- /* Null/empty geometries get a NULL pointer */
- if ((!geom) || lwgeom_is_empty(geom))
- {
- objs[i] = NULL;
- /* mark as unreachable */
- distances[i] = -1;
- continue;
- }
+ /* Null/empty geometries get a NULL pointer, set earlier with memset */
+ if ((!geom) || lwgeom_is_empty(geom)) continue;
- distances[i] = DBL_MAX;
-
/* If the input is a point, use its coordinates */
/* If its not a point, convert it to one via centroid */
if (lwgeom_get_type(geom) != POINTTYPE)
{
LWGEOM* centroid = lwgeom_centroid(geom);
- if ((!centroid) || lwgeom_is_empty(centroid))
+ if ((!centroid)) continue;
+ if (lwgeom_is_empty(centroid))
{
- objs[i] = NULL;
+ lwgeom_free(centroid);
continue;
}
centroids[num_centroids++] = centroid;
@@ -207,71 +312,22 @@
lwpoint = lwgeom_as_lwpoint(centroid);
}
else
- {
lwpoint = lwgeom_as_lwpoint(geom);
- }
/* Store a pointer to the POINT2D we are interested in */
cp = getPoint2d_cp(lwpoint->point, 0);
objs[i] = (POINT2D*)cp;
-
- /* Find the point with largest Euclidean norm to use as seed */
- curr_norm = (cp->x) * (cp->x) + (cp->y) * (cp->y);
- if (curr_norm > max_norm)
- {
- boundary_point_idx = i;
- max_norm = curr_norm;
- }
+ num_non_empty++;
}
- if (max_norm == -DBL_MAX)
+ if (num_non_empty < k)
{
- lwerror("unable to calculate any cluster seed point, too many NULLs or empties?");
+ lwnotice("%s: number of non-empty geometries is less than the number of clusters requested, not all clusters will get data", __func__);
+ k = num_non_empty;
}
- /* start with point on boundary */
- distances[boundary_point_idx] = -1;
- centers_raw[0] = *((POINT2D*)objs[boundary_point_idx]);
- centers[0] = &(centers_raw[0]);
- /* loop i on clusters, skip 0 as it's found already */
- for (i = 1; i < k; i++)
- {
- uint32_t j;
- double max_distance = -DBL_MAX;
- double curr_distance;
- uint32_t candidate_center = 0;
+ kmeans_init(objs, clusters, n, centers, centers_raw, k);
- /* loop j on objs */
- for (j = 0; j < n; j++)
- {
- /* empty objs and accepted clusters are already marked with distance = -1 */
- if (distances[j] < 0)
- continue;
-
- /* update distance to closest cluster */
- curr_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
- distances[j] = fmin(curr_distance, distances[j]);
-
- /* greedily take a point that's farthest from any of accepted clusters */
- if (distances[j] > max_distance)
- {
- candidate_center = j;
- max_distance = distances[j];
- }
- }
-
- /* something is wrong with data, cannot find a candidate */
- if (max_distance < 0)
- lwerror("unable to calculate cluster seed points, too many NULLs or empties?");
-
- /* accept candidtate to centers */
- distances[candidate_center] = -1;
- /* Copy the point coordinates into the initial centers array
- * This is ugly, but the centers array is an array of pointers to points, not an array of points */
- centers_raw[i] = *((POINT2D*)objs[candidate_center]);
- centers[i] = &(centers_raw[i]);
- }
-
result = kmeans(objs, clusters, n, centers, k);
/* Before error handling, might as well clean up all the inputs */
@@ -279,13 +335,11 @@
lwfree(centers);
lwfree(centers_raw);
lwfree(centroids);
- lwfree(distances);
/* Good result */
- if (result)
- return clusters;
+ if (result) return clusters;
/* Bad result, not going to need the answer */
lwfree(clusters);
return NULL;
-}
\ No newline at end of file
+}
Modified: trunk/liblwgeom/measures.c
===================================================================
--- trunk/liblwgeom/measures.c 2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/liblwgeom/measures.c 2018-04-21 12:36:51 UTC (rev 16548)
@@ -2308,35 +2308,24 @@
End of Functions in common for Brute force and new calculation
--------------------------------------------------------------------------------------------------------------*/
-
-/**
-The old function necessary for ptarray_segmentize2d in ptarray.c
-*/
-double
+inline double
distance2d_pt_pt(const POINT2D *p1, const POINT2D *p2)
{
double hside = p2->x - p1->x;
double vside = p2->y - p1->y;
- return sqrt ( hside*hside + vside*vside );
-
+ return hypot(hside, vside);
}
-double
+inline double
distance2d_sqr_pt_pt(const POINT2D *p1, const POINT2D *p2)
{
double hside = p2->x - p1->x;
double vside = p2->y - p1->y;
- return hside*hside + vside*vside;
-
+ return hside * hside + vside * vside;
}
-
-/**
-
-The old function necessary for ptarray_segmentize2d in ptarray.c
-*/
double
distance2d_pt_seg(const POINT2D *p, const POINT2D *A, const POINT2D *B)
{
Modified: trunk/regress/lwgeom_regress.sql
===================================================================
--- trunk/regress/lwgeom_regress.sql 2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/lwgeom_regress.sql 2018-04-21 12:36:51 UTC (rev 16548)
@@ -201,6 +201,14 @@
order by count(*)
limit 1;
+-- check that null and empty is handled in the clustering
+select '#4071', count(distinct a), count(distinct b), count(distinct c) from
+(select
+ ST_ClusterKMeans(geom, 1) over () a,
+ ST_ClusterKMeans(geom, 2) over () b,
+ ST_ClusterKMeans(geom, 3) over () c
+from (values (null::geometry), ('POINT(1 1)'), ('POINT EMPTY'), ('POINT(0 0)'), ('POINT(4 4)')) as g (geom)) z;
+
-- typmod checks
select 'typmod_point_4326', geometry_typmod_out(geometry_typmod_in('{Point,4326}'));
select 'typmod_point_0', geometry_typmod_out(geometry_typmod_in('{Point,0}'));
Modified: trunk/regress/lwgeom_regress_expected
===================================================================
--- trunk/regress/lwgeom_regress_expected 2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/lwgeom_regress_expected 2018-04-21 12:36:51 UTC (rev 16548)
@@ -34,6 +34,7 @@
ST_Angle_2_lines|4.71238898038469
#3965|25|25
#3971|t
+#4071|2|3|4
typmod_point_4326|(Point,4326)
typmod_point_0|(Point)
NOTICE: SRID value -1 converted to the officially unknown SRID value 0
Modified: trunk/regress/tickets.sql
===================================================================
--- trunk/regress/tickets.sql 2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/tickets.sql 2018-04-21 12:36:51 UTC (rev 16548)
@@ -878,7 +878,7 @@
WITH
input AS (SELECT 'SRID=4326;POLYGON((26426 65078,26531 65242,26075 65136,26096 65427,26426 65078))'::geometry AS geom),
mbc AS (SELECT (mb).center, (mb).radius FROM (SELECT ST_MinimumBoundingRadius(geom) mb FROM input) sq)
-SELECT '#2996', radius = ST_Length(ST_LongestLine(geom, center)) FROM input, mbc;
+SELECT '#2996', radius::numeric(8,2), ST_Length(ST_LongestLine(geom, center))::numeric(8,2) FROM input, mbc;
-- #3119 --
SELECT '#3119a', floor(ST_LengthSpheroid('SRID=4326;LINESTRING (-72.640965 42.11867, -72.6395 42.1187)', 'SPHEROID["GRS_1980",6378137,298.257222101]'));
Modified: trunk/regress/tickets_expected
===================================================================
--- trunk/regress/tickets_expected 2018-04-17 10:20:22 UTC (rev 16547)
+++ trunk/regress/tickets_expected 2018-04-21 12:36:51 UTC (rev 16548)
@@ -270,7 +270,7 @@
#2870|Point[GS]
#2956|t
#2985|LINESTRING(20.9511664253809 52.3984560730436)
-#2996|t
+#2996|247.44|247.44
#3119a|121
#3119b|291
#3119c|615
More information about the postgis-tickets
mailing list