[postgis-tickets] r16228 - Change KMeans init algorithm to one with less skew.
Darafei
komzpa at gmail.com
Tue Jan 9 06:01:59 PST 2018
Author: komzpa
Date: 2018-01-09 06:01:58 -0800 (Tue, 09 Jan 2018)
New Revision: 16228
Modified:
trunk/liblwgeom/lwkmeans.c
trunk/regress/lwgeom_regress.sql
trunk/regress/lwgeom_regress_expected
Log:
Change KMeans init algorithm to one with less skew.
Closes #3971
Closes https://github.com/postgis/postgis/pull/181
Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c 2018-01-09 13:49:40 UTC (rev 16227)
+++ trunk/liblwgeom/lwkmeans.c 2018-01-09 14:01:58 UTC (rev 16228)
@@ -4,7 +4,6 @@
#include "kmeans.h"
#include "liblwgeom_internal.h"
-
static double lwkmeans_pt_distance(const Pointer a, const Pointer b)
{
POINT2D *pa = (POINT2D*)a;
@@ -16,32 +15,6 @@
return dx*dx + dy*dy;
}
-static int lwkmeans_pt_closest(const Pointer * objs, size_t num_objs, const Pointer a)
-{
- int i;
- double d;
- double d_closest = FLT_MAX;
- int closest = -1;
-
- assert(num_objs > 0);
-
- for (i = 0; i < num_objs; i++)
- {
- /* Skip nulls/empties */
- if (!objs[i])
- continue;
-
- d = lwkmeans_pt_distance(objs[i], a);
- if (d < d_closest)
- {
- d_closest = d;
- closest = i;
- }
- }
-
- return closest;
-}
-
static void lwkmeans_pt_centroid(const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid)
{
int i;
@@ -80,24 +53,22 @@
int num_centroids = 0;
LWGEOM **centroids;
POINT2D *centers_raw;
+ double *distances;
const POINT2D *cp;
- POINT2D min = { DBL_MAX, DBL_MAX };
- POINT2D max = { -DBL_MAX, -DBL_MAX };
- double dx, dy;
kmeans_config config;
kmeans_result result;
- int *seen;
- int sidx = 0;
+ int boundary_point_idx = 0;
+ double xmin = DBL_MAX;
assert(k>0);
assert(ngeoms>0);
assert(geoms);
- /* Initialize our static structs */
- memset(&config, 0, sizeof(kmeans_config));
- memset(&result, 0, sizeof(kmeans_result));
+ /* Initialize our static structs */
+ memset(&config, 0, sizeof(kmeans_config));
+ memset(&result, 0, sizeof(kmeans_result));
- if (ngeoms<k)
+ if (ngeoms < k)
{
lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
}
@@ -127,6 +98,10 @@
memset(config.clusters, 0, sizeof(int) * ngeoms);
memset(config.centers, 0, sizeof(Pointer) * k);
+ /* Array of sums of distances to a point from accepted cluster centers */
+ distances = lwalloc(sizeof(double)*config.num_objs);
+ memset(distances, 0, sizeof(double)*config.num_objs);
+
/* Prepare the list of object pointers for K-means */
for (i = 0; i < ngeoms; i++)
{
@@ -137,6 +112,8 @@
if ((!geom) || lwgeom_is_empty(geom))
{
config.objs[i] = NULL;
+ /* mark as unreachable */
+ distances[i] = -1;
continue;
}
@@ -162,58 +139,55 @@
cp = getPoint2d_cp(lwpoint->point, 0);
config.objs[i] = (Pointer)cp;
- /* Since we're already here, let's calculate the extrema of the set */
- if (cp->x < min.x) min.x = cp->x;
- if (cp->y < min.y) min.y = cp->y;
- if (cp->x > max.x) max.x = cp->x;
- if (cp->y > max.y) max.y = cp->y;
+ /* Find the point that's on the boundary to use as seed */
+ if (xmin > cp->x)
+ {
+ boundary_point_idx = i;
+ xmin = cp->x;
+ }
}
- /*
- * We map a uniform assignment of points in the area covered by the set
- * onto actual points in the set
- */
- dx = (max.x - min.x)/k;
- dy = (max.y - min.y)/k;
- seen = lwalloc(sizeof(int)*config.k);
- memset(seen, 0, sizeof(int)*config.k);
- for (i = 0; i < k; i++)
+ if (xmin == DBL_MAX)
{
- int closest;
- POINT2D p;
+ lwerror("unable to calculate any cluster seed point, too many NULLs or empties?");
+ }
+
+ /* start with point on boundary */
+ distances[boundary_point_idx] = -1;
+ centers_raw[0] = *((POINT2D*)config.objs[boundary_point_idx]);
+ config.centers[0] = &(centers_raw[0]);
+ /* loop i on clusters, skip 0 as it's found already */
+ for (i = 1; i < k; i++)
+ {
int j;
+ double max_distance = -DBL_MAX;
+ int candidate_center = 0;
- /* Calculate a point in the range */
- p.x = min.x + dx * (i + 0.5);
- p.y = min.y + dy * (i + 0.5);
-
- /* Find the data point closest to the calculated point */
- closest = lwkmeans_pt_closest(config.objs, config.num_objs, &p);
-
- /* If something is terrible wrong w/ data, cannot find a closest */
- if (closest < 0)
- lwerror("unable to calculate cluster seed points, too many NULLs or empties?");
-
- /* Ensure we aren't already using that point as a seed */
- j = 0;
- while(j < sidx)
+ /* loop j on objs */
+ for (j = 0; j < config.num_objs; j++)
{
- if (seen[j] == closest)
+ /* empty objs and accepted clusters are already marked with distance = -1 */
+ if (distances[j] < 0) continue;
+
+ /* greedily take a point that's farthest from all accepted clusters */
+ distances[j] += lwkmeans_pt_distance(&config.objs[j], ¢ers_raw[i-1]);
+ if (distances[j] > max_distance)
{
- closest = (closest + 1) % config.num_objs;
- j = 0;
+ candidate_center = j;
+ max_distance = distances[j];
}
- else
- {
- j++;
- }
}
- seen[sidx++] = closest;
+ /* something is wrong with data, cannot find a candidate */
+ if (max_distance < 0)
+ lwerror("unable to calculate cluster seed points, too many NULLs or empties?");
+
+ /* accept candidtate to centers */
+ distances[candidate_center] = -1;
/* Copy the point coordinates into the initial centers array */
/* This is ugly, but the centers array is an array of */
/* pointers to points, not an array of points */
- centers_raw[i] = *((POINT2D*)config.objs[closest]);
+ centers_raw[i] = *((POINT2D*)config.objs[candidate_center]);
config.centers[i] = &(centers_raw[i]);
}
@@ -224,7 +198,7 @@
lwfree(config.centers);
lwfree(centers_raw);
lwfree(centroids);
- lwfree(seen);
+ lwfree(distances);
/* Good result */
if (result == KMEANS_OK)
@@ -241,4 +215,3 @@
/* Unknown error */
return NULL;
}
-
Modified: trunk/regress/lwgeom_regress.sql
===================================================================
--- trunk/regress/lwgeom_regress.sql 2018-01-09 13:49:40 UTC (rev 16227)
+++ trunk/regress/lwgeom_regress.sql 2018-01-09 14:01:58 UTC (rev 16228)
@@ -171,8 +171,38 @@
, ST_GeomFromtext('LINESTRING(1 0, 2 0)') AS l2 ;
--- ST_ClusterKMeans
-select '#3965', count(distinct cid), count(*) from (
- with points as (select ST_MakePoint(x,y) geom from generate_series(1,5) x, generate_series(1,5) y)
- select ST_ClusterKMeans(geom, 25) over () as cid, geom
- from points) z;
+-- check we have not less than k clusters
+select
+ '#3965',
+ count(distinct cid),
+ count(*)
+from (
+ with points as (
+ select ST_MakePoint(x, y) geom
+ from generate_series(1, 5) x,
+ generate_series(1, 5) y
+ )
+ select
+ ST_ClusterKMeans(geom, 25)
+ over () as cid,
+ geom
+ from points) z;
+
+-- check that grid gets clustered to clusters of similar size
+select '#3971', count(*) >= 12 -- in perfect match it's 25, #3971 increases it to 12 from 4
+from (
+ with
+ points as (
+ select ST_MakePoint(x, y) geom
+ from generate_series(1, 45) x, generate_series(1, 45) y
+ )
+ select
+ ST_ClusterKMeans(geom, 81)
+ over () as cid,
+ geom
+ from points
+ ) z
+group by cid
+order by count(*)
+limit 1;
Modified: trunk/regress/lwgeom_regress_expected
===================================================================
--- trunk/regress/lwgeom_regress_expected 2018-01-09 13:49:40 UTC (rev 16227)
+++ trunk/regress/lwgeom_regress_expected 2018-01-09 14:01:58 UTC (rev 16228)
@@ -33,3 +33,4 @@
ERROR: Empty geometry
ST_Angle_2_lines|4.71238898038469
#3965|25|25
+#3971|t
More information about the postgis-tickets
mailing list