[postgis-tickets] r16251 - Simplify KMeans code.
Darafei
komzpa at gmail.com
Wed Jan 10 11:27:49 PST 2018
Author: komzpa
Date: 2018-01-10 11:27:48 -0800 (Wed, 10 Jan 2018)
New Revision: 16251
Removed:
trunk/liblwgeom/kmeans.c
trunk/liblwgeom/kmeans.h
Modified:
trunk/NEWS
trunk/liblwgeom/Makefile.in
trunk/liblwgeom/lwkmeans.c
trunk/regress/lwgeom_regress.sql
Log:
Simplify KMeans code.
Remove threaded implementation.
Rewrite generic implementation to be POINT2D specific.
Closes #3977
Closes https://github.com/postgis/postgis/pull/186
Modified: trunk/NEWS
===================================================================
--- trunk/NEWS 2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/NEWS 2018-01-10 19:27:48 UTC (rev 16251)
@@ -32,6 +32,7 @@
- #3965, ST_ClusterKMeans used to lose some clusters on initialization
(Darafei Praliaskouski)
- #3971, ST_ClusterKMeans now uses better initial seed (Darafei Praliaskouski)
+ - #3977, ST_ClusterKMeans is now faster and simpler (Darafei Praliaskouski)
PostGIS 2.4.0
2017/09/30
Modified: trunk/liblwgeom/Makefile.in
===================================================================
--- trunk/liblwgeom/Makefile.in 2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/Makefile.in 2018-01-10 19:27:48 UTC (rev 16251)
@@ -115,7 +115,6 @@
lwunionfind.o \
effectivearea.o \
lwkmeans.o \
- kmeans.o \
varint.o
NM_OBJS = \
@@ -179,9 +178,6 @@
../postgis_svn_revision.h:
$(MAKE) -C .. postgis_svn_revision.h
-#liblwgeom.a: $(SA_OBJS) $(NM_OBJS) $(SA_HEADERS)
-#ar rs liblwgeom.a $(SA_OBJS) $(NM_OBJS)
-
liblwgeom.la: $(LT_OBJS)
$(LIBTOOL) --tag=CC --mode=link $(CC) -rpath $(libdir) $(LT_OBJS) \
-release $(SOVER) -version-info $(VERSION_INFO) $(LDFLAGS) -o $@
Deleted: trunk/liblwgeom/kmeans.c
===================================================================
--- trunk/liblwgeom/kmeans.c 2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/kmeans.c 2018-01-10 19:27:48 UTC (rev 16251)
@@ -1,317 +0,0 @@
-/*-------------------------------------------------------------------------
-*
-* kmeans.c
-* Generic k-means implementation
-*
-* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
-*
-*------------------------------------------------------------------------*/
-
-#include <assert.h>
-#include <float.h>
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "kmeans.h"
-
-#ifdef KMEANS_THREADED
-#include <pthread.h>
-#endif
-
-static void
-update_r(kmeans_config *config)
-{
- int i;
-
- for (i = 0; i < config->num_objs; i++)
- {
- double distance, curr_distance;
- int cluster, curr_cluster;
- Pointer obj;
-
- assert(config->objs != NULL);
- assert(config->num_objs > 0);
- assert(config->centers);
- assert(config->clusters);
-
- obj = config->objs[i];
-
- /*
- * Don't try to cluster NULL objects, just add them
- * to the "unclusterable cluster"
- */
- if (!obj)
- {
- config->clusters[i] = KMEANS_NULL_CLUSTER;
- continue;
- }
-
- /* Initialize with distance to first cluster */
- curr_distance = (config->distance_method)(obj, config->centers[0]);
- curr_cluster = 0;
-
- /* Check all other cluster centers and find the nearest */
- for (cluster = 1; cluster < config->k; cluster++)
- {
- distance = (config->distance_method)(obj, config->centers[cluster]);
- if (distance < curr_distance)
- {
- curr_distance = distance;
- curr_cluster = cluster;
- }
- }
-
- /* Store the nearest cluster this object is in */
- config->clusters[i] = curr_cluster;
- }
-}
-
-static void
-update_means(kmeans_config *config)
-{
- int i;
-
- for (i = 0; i < config->k; i++)
- {
- /* Update the centroid for this cluster */
- (config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
- }
-}
-
-#ifdef KMEANS_THREADED
-
-static void * update_r_threaded_main(void *args)
-{
- kmeans_config *config = (kmeans_config*)args;
- update_r(config);
- pthread_exit(args);
-}
-
-static void update_r_threaded(kmeans_config *config)
-{
- /* Computational complexity is function of objs/clusters */
- /* We only spin up threading infra if we need more than one core */
- /* running. We keep the threshold high so the overhead of */
- /* thread management is small compared to thread compute time */
- int num_threads = config->num_objs * config->k / KMEANS_THR_THRESHOLD;
-
- /* Can't run more threads than the maximum */
- num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
-
- /* If the problem size is small, don't bother w/ threading */
- if (num_threads < 1)
- {
- update_r(config);
- }
- else
- {
- pthread_t thread[KMEANS_THR_MAX];
- pthread_attr_t thread_attr;
- kmeans_config thread_config[KMEANS_THR_MAX];
- int obs_per_thread = config->num_objs / num_threads;
- int i, rc;
-
- for (i = 0; i < num_threads; i++)
- {
- /*
- * Each thread gets a copy of the config, but with the list pointers
- * offset to the start of the batch the thread is responsible for, and the
- * object count number adjusted similarly.
- */
- memcpy(&(thread_config[i]), config, sizeof(kmeans_config));
- thread_config[i].objs += i*obs_per_thread;
- thread_config[i].clusters += i*obs_per_thread;
- thread_config[i].num_objs = obs_per_thread;
- if (i == num_threads-1)
- {
- thread_config[i].num_objs += config->num_objs - num_threads*obs_per_thread;
- }
-
- /* Initialize and set thread detached attribute */
- pthread_attr_init(&thread_attr);
- pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
-
- /* Now we just run the thread, on its subset of the data */
- rc = pthread_create(&thread[i], &thread_attr, update_r_threaded_main, (void *) &thread_config[i]);
- if (rc)
- {
- printf("ERROR: return code from pthread_create() is %d\n", rc);
- exit(-1);
- }
- }
-
- /* Free attribute and wait for the other threads */
- pthread_attr_destroy(&thread_attr);
-
- /* Wait for all calculations to complete */
- for (i = 0; i < num_threads; i++)
- {
- void *status;
- rc = pthread_join(thread[i], &status);
- if (rc)
- {
- printf("ERROR: return code from pthread_join() is %d\n", rc);
- exit(-1);
- }
- }
- }
-}
-
-int update_means_k;
-pthread_mutex_t update_means_k_mutex;
-
-static void *
-update_means_threaded_main(void *arg)
-{
- kmeans_config *config = (kmeans_config*)arg;
- int i = 0;
-
- do
- {
- pthread_mutex_lock (&update_means_k_mutex);
- i = update_means_k;
- update_means_k++;
- pthread_mutex_unlock (&update_means_k_mutex);
-
- if (i < config->k)
- (config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
- }
- while (i < config->k);
-
- pthread_exit(arg);
-}
-
-static void
-update_means_threaded(kmeans_config *config)
-{
- /* We only spin up threading infra if we need more than one core */
- /* running. We keep the threshold high so the overhead of */
- /* thread management is small compared to thread compute time */
- int num_threads = config->num_objs / KMEANS_THR_THRESHOLD;
-
- /* Can't run more threads than the maximum */
- num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
-
- /* If the problem size is small, don't bother w/ threading */
- if (num_threads < 1)
- {
- update_means(config);
- }
- else
- {
- /* Mutex protected counter to drive threads */
- pthread_t thread[KMEANS_THR_MAX];
- pthread_attr_t thread_attr;
- int i, rc;
-
- pthread_mutex_init(&update_means_k_mutex, NULL);
- update_means_k = 0;
-
- pthread_attr_init(&thread_attr);
- pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
-
- /* Create threads to perform computation */
- for (i = 0; i < num_threads; i++)
- {
-
- /* Now we just run the thread, on its subset of the data */
- rc = pthread_create(&thread[i], &thread_attr, update_means_threaded_main, (void *) config);
- if (rc)
- {
- printf("ERROR: return code from pthread_create() is %d\n", rc);
- exit(-1);
- }
- }
-
- pthread_attr_destroy(&thread_attr);
-
- /* Watch until completion */
- for (i = 0; i < num_threads; i++)
- {
- void *status;
- rc = pthread_join(thread[i], &status);
- if (rc)
- {
- printf("ERROR: return code from pthread_join() is %d\n", rc);
- exit(-1);
- }
- }
-
- pthread_mutex_destroy(&update_means_k_mutex);
- }
-}
-
-#endif /* KMEANS_THREADED */
-
-kmeans_result
-kmeans(kmeans_config *config)
-{
- int iterations = 0;
- int *clusters_last;
- size_t clusters_sz = sizeof(int)*config->num_objs;
-
- assert(config);
- assert(config->objs);
- assert(config->num_objs);
- assert(config->distance_method);
- assert(config->centroid_method);
- assert(config->centers);
- assert(config->k);
- assert(config->clusters);
- assert(config->k <= config->num_objs);
-
- /* Zero out cluster numbers, just in case user forgets */
- memset(config->clusters, 0, clusters_sz);
-
- /* Set default max iterations if necessary */
- if (!config->max_iterations)
- config->max_iterations = KMEANS_MAX_ITERATIONS;
-
- /*
- * Previous cluster state array. At this time, r doesn't mean anything
- * but it's ok
- */
- clusters_last = kmeans_malloc(clusters_sz);
-
- while (1)
- {
- LW_ON_INTERRUPT(kmeans_free(clusters_last); return KMEANS_ERROR);
-
- /* Store the previous state of the clustering */
- memcpy(clusters_last, config->clusters, clusters_sz);
-
-#ifdef KMEANS_THREADED
- update_r_threaded(config);
- update_means_threaded(config);
-#else
- update_r(config);
- update_means(config);
-#endif
- /*
- * if all the cluster numbers are unchanged since last time,
- * we are at a stable solution, so we can stop here
- */
- if (memcmp(clusters_last, config->clusters, clusters_sz) == 0)
- {
- kmeans_free(clusters_last);
- config->total_iterations = iterations;
- return KMEANS_OK;
- }
-
- if (iterations++ > config->max_iterations)
- {
- kmeans_free(clusters_last);
- config->total_iterations = iterations;
- return KMEANS_EXCEEDED_MAX_ITERATIONS;
- }
-
- }
-
- kmeans_free(clusters_last);
- config->total_iterations = iterations;
- return KMEANS_ERROR;
-}
-
-
Deleted: trunk/liblwgeom/kmeans.h
===================================================================
--- trunk/liblwgeom/kmeans.h 2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/kmeans.h 2018-01-10 19:27:48 UTC (rev 16251)
@@ -1,126 +0,0 @@
-/*-------------------------------------------------------------------------
-*
-* kmeans.h
-* Generic k-means implementation
-*
-* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
-*
-*------------------------------------------------------------------------*/
-
-
-#include <stdlib.h>
-#include "liblwgeom_internal.h"
-
-/*
-* Simple k-means implementation for arbitrary data structures
-*
-* Since k-means partitions based on inter-object "distance" the same
-* machinery can be used to support any object type that can calculate a
-* "distance" between pairs.
-*
-* To use the k-means infrastructure, just fill out the kmeans_config
-* structure and invoke the kmeans() function.
-*/
-
-/*
-* Threaded calculation is available using pthreads, which practically
-* means UNIX platforms only, unless you're building with a posix
-* compatible environment.
-*
-* #define KMEANS_THREADED
-*/
-
-/*
-* When clustering lists with NULL elements, they will get this as
-* their cluster number. (All the other clusters will be non-negative)
-*/
-#define KMEANS_NULL_CLUSTER -1
-
-/*
-* If the algorithm doesn't converge within this number of iterations,
-* it will return with a failure error code.
-*/
-#define KMEANS_MAX_ITERATIONS 1000
-
-/*
-* The code doesn't try to figure out how many threads to use, so
-* best to set this to the number of cores you expect to have
-* available. The threshold is the the value of k*n at which to
-* move to multi-threading.
-*/
-#ifdef KMEANS_THREADED
-#define KMEANS_THR_MAX 4
-#define KMEANS_THR_THRESHOLD 250000
-#endif
-
-#define kmeans_malloc(size) lwalloc(size)
-#define kmeans_free(ptr) lwfree(ptr)
-
-typedef void * Pointer;
-
-typedef enum {
- KMEANS_OK,
- KMEANS_EXCEEDED_MAX_ITERATIONS,
- KMEANS_ERROR
-} kmeans_result;
-
-/*
-* Prototype for the distance calculating function
-*/
-typedef double (*kmeans_distance_method) (const Pointer a, const Pointer b);
-
-/*
-* Prototype for the centroid calculating function
-* @param objs the list of all objects in the calculation
-* @param clusters the list of cluster numbers for each object
-* @param num_objs the number of objects/cluster numbers in the previous arrays
-* @param cluster the cluster number we are actually generating a centroid for here
-* @param centroid the object to write the centroid result into (already allocated)
-*/
-typedef void (*kmeans_centroid_method) (const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid);
-
-typedef struct kmeans_config
-{
- /* Function returns the "distance" between any pair of objects */
- kmeans_distance_method distance_method;
-
- /* Function returns the "centroid" of a collection of objects */
- kmeans_centroid_method centroid_method;
-
- /* An array of objects to be analyzed. User allocates this array */
- /* and is responsible for freeing it. */
- /* For objects that are not capable of participating in the distance */
- /* calculations, but for which you still want included in the process */
- /* (for examples, database nulls, or geometry empties) use a NULL */
- /* value in this list. All NULL values will be returned in the */
- /* KMEANS_NULL_CLUSTER. */
- Pointer * objs;
-
- /* Number of objects in the preceding array */
- size_t num_objs;
-
- /* An array of initial centers for the algorithm */
- /* Can be randomly assigned, or using proportions, */
- /* unfortunately the algorithm is sensitive to starting */
- /* points, so using a "better" set of starting points */
- /* might be wise. User allocates and is responsible for freeing. */
- Pointer * centers;
-
- /* Number of means we are calculating, length of preceding array */
- unsigned int k;
-
- /* Maximum number of times to iterate the algorithm, or 0 for */
- /* library default */
- unsigned int max_iterations;
-
- /* Iteration counter */
- unsigned int total_iterations;
-
- /* Array to fill in with cluster numbers. User allocates and frees. */
- int * clusters;
-
-} kmeans_config;
-
-/* This is where the magic happens. */
-kmeans_result kmeans(kmeans_config *config);
-
Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c 2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/lwkmeans.c 2018-01-10 19:27:48 UTC (rev 16251)
@@ -1,130 +1,206 @@
+/*-------------------------------------------------------------------------
+ *
+ * Copyright (c) 2018, Darafei Praliaskouski <me at komzpa.net>
+ * Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
+ *
+ *------------------------------------------------------------------------*/
+
#include <float.h>
#include <math.h>
-#include "kmeans.h"
#include "liblwgeom_internal.h"
-static double lwkmeans_pt_distance(const Pointer a, const Pointer b)
+/*
+ * When clustering lists with NULL elements, they will get this as
+ * their cluster number. (All the other clusters will be non-negative)
+ */
+#define KMEANS_NULL_CLUSTER -1
+
+/*
+ * If the algorithm doesn't converge within this number of iterations,
+ * it will return with a failure error code.
+ */
+#define KMEANS_MAX_ITERATIONS 1000
+
+static void
+update_r(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, unsigned int k)
{
- POINT2D *pa = (POINT2D*)a;
- POINT2D *pb = (POINT2D*)b;
+ POINT2D* obj;
+ unsigned int i;
+ double distance, curr_distance;
+ int cluster, curr_cluster;
- double dx = (pa->x - pb->x);
- double dy = (pa->y - pb->y);
+ for (i = 0; i < n; i++)
+ {
+ obj = objs[i];
- return dx*dx + dy*dy;
+ /* Don't try to cluster NULL objects, just add them to the "unclusterable cluster" */
+ if (!obj)
+ {
+ clusters[i] = KMEANS_NULL_CLUSTER;
+ continue;
+ }
+
+ /* Initialize with distance to first cluster */
+ curr_distance = distance2d_sqr_pt_pt(obj, centers[0]);
+ curr_cluster = 0;
+
+ /* Check all other cluster centers and find the nearest */
+ for (cluster = 1; cluster < k; cluster++)
+ {
+ distance = distance2d_sqr_pt_pt(obj, centers[cluster]);
+ if (distance < curr_distance)
+ {
+ curr_distance = distance;
+ curr_cluster = cluster;
+ }
+ }
+
+ /* Store the nearest cluster this object is in */
+ clusters[i] = curr_cluster;
+ }
}
-static void lwkmeans_pt_centroid(const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid)
+static void
+update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, unsigned int* weights, unsigned int k)
{
- int i;
- int num_cluster = 0;
- POINT2D sum;
- POINT2D **pts = (POINT2D**)objs;
- POINT2D *center = (POINT2D*)centroid;
+ unsigned int i;
- sum.x = sum.y = 0.0;
-
- if (num_objs <= 0) return;
-
- for (i = 0; i < num_objs; i++)
+ memset(weights, 0, sizeof(int) * k);
+ for (i = 0; i < k; i++)
{
- /* Skip points that are not of interest */
- if (clusters[i] != cluster) continue;
-
- sum.x += pts[i]->x;
- sum.y += pts[i]->y;
- num_cluster++;
+ centers[i]->x = 0.0;
+ centers[i]->y = 0.0;
}
- if (num_cluster)
+ for (i = 0; i < n; i++)
{
- sum.x /= num_cluster;
- sum.y /= num_cluster;
- *center = sum;
+ centers[clusters[i]]->x += objs[i]->x;
+ centers[clusters[i]]->y += objs[i]->y;
+ weights[clusters[i]] += 1;
}
- return;
+ for (i = 0; i < k; i++)
+ {
+ centers[i]->x /= weights[i];
+ centers[i]->y /= weights[i];
+ }
}
-
-int *
-lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, int ngeoms, int k)
+static int
+kmeans(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, unsigned int k)
{
- int i;
- int num_centroids = 0;
- LWGEOM **centroids;
- POINT2D *centers_raw;
- double *distances;
- const POINT2D *cp;
- kmeans_config config;
- kmeans_result result;
- int boundary_point_idx = 0;
- double xmin = DBL_MAX;
+ unsigned int i = 0;
+ int* clusters_last;
+ int converged = LW_FALSE;
+ uint32_t clusters_sz = sizeof(int) * n;
+ unsigned int* weights;
- assert(k>0);
- assert(ngeoms>0);
- assert(geoms);
+ weights = lwalloc(sizeof(int) * k);
- /* Initialize our static structs */
- memset(&config, 0, sizeof(kmeans_config));
- memset(&result, 0, sizeof(kmeans_result));
+ /*
+ * Previous cluster state array. At this time, r doesn't mean anything
+ * but it's ok
+ */
+ clusters_last = lwalloc(clusters_sz);
- if (ngeoms < k)
+ for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
{
- lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
+ LW_ON_INTERRUPT(break);
+
+ /* Store the previous state of the clustering */
+ memcpy(clusters_last, clusters, clusters_sz);
+
+ update_r(objs, clusters, n, centers, k);
+ update_means(objs, clusters, n, centers, weights, k);
+
+ /* if all the cluster numbers are unchanged, we are at a stable solution */
+ converged = memcmp(clusters_last, clusters, clusters_sz) == 0;
}
+ lwfree(clusters_last);
+ lwfree(weights);
+ if (!converged)
+ lwerror("%s did not converge after %d iterations", __func__, i);
+ return converged;
+}
+
+int*
+lwgeom_cluster_2d_kmeans(const LWGEOM** geoms, int n, int k)
+{
+ unsigned int i;
+ unsigned int num_centroids = 0;
+ LWGEOM** centroids;
+ POINT2D* centers_raw;
+ double* distances;
+ const POINT2D* cp;
+ int result = LW_FALSE;
+ unsigned int boundary_point_idx = 0;
+ double max_norm = -DBL_MAX;
+ double curr_norm;
+
+ /* An array of objects to be analyzed.
+ * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
+ POINT2D** objs;
+
+ /* An array of centers for the algorithm. */
+ POINT2D** centers;
+
+ /* Array to fill in with cluster numbers. */
+ int* clusters;
+
+ assert(k > 0);
+ assert(n > 0);
+ assert(geoms);
+
+ if (n < k)
+ lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
+
/* We'll hold the temporary centroid objects here */
- centroids = lwalloc(sizeof(LWGEOM*) * ngeoms);
- memset(centroids, 0, sizeof(LWGEOM*) * ngeoms);
+ centroids = lwalloc(sizeof(LWGEOM*) * n);
+ memset(centroids, 0, sizeof(LWGEOM*) * n);
- /* The vector of cluster means. We have to allocate a */
- /* chunk of memory for these because we'll be mutating them */
- /* in the kmeans algorithm */
+ /* The vector of cluster means. We have to allocate a chunk of memory for these because we'll be mutating them
+ * in the kmeans algorithm */
centers_raw = lwalloc(sizeof(POINT2D) * k);
memset(centers_raw, 0, sizeof(POINT2D) * k);
/* K-means configuration setup */
- config.objs = lwalloc(sizeof(Pointer) * ngeoms);
- config.num_objs = ngeoms;
- config.clusters = lwalloc(sizeof(int) * ngeoms);
- config.centers = lwalloc(sizeof(Pointer) * k);
- config.k = k;
- config.max_iterations = 0;
- config.distance_method = lwkmeans_pt_distance;
- config.centroid_method = lwkmeans_pt_centroid;
+ objs = lwalloc(sizeof(POINT2D*) * n);
+ clusters = lwalloc(sizeof(int) * n);
+ centers = lwalloc(sizeof(POINT2D*) * k);
/* Clean the memory */
- memset(config.objs, 0, sizeof(Pointer) * ngeoms);
- memset(config.clusters, 0, sizeof(int) * ngeoms);
- memset(config.centers, 0, sizeof(Pointer) * k);
+ memset(objs, 0, sizeof(POINT2D*) * n);
+ memset(clusters, 0, sizeof(int) * n);
+ memset(centers, 0, sizeof(POINT2D*) * k);
/* Array of sums of distances to a point from accepted cluster centers */
- distances = lwalloc(sizeof(double)*config.num_objs);
- memset(distances, 0, sizeof(double)*config.num_objs);
+ distances = lwalloc(sizeof(double) * n);
/* Prepare the list of object pointers for K-means */
- for (i = 0; i < ngeoms; i++)
+ for (i = 0; i < n; i++)
{
- const LWGEOM *geom = geoms[i];
- LWPOINT *lwpoint;
+ const LWGEOM* geom = geoms[i];
+ LWPOINT* lwpoint;
/* Null/empty geometries get a NULL pointer */
if ((!geom) || lwgeom_is_empty(geom))
{
- config.objs[i] = NULL;
+ objs[i] = NULL;
/* mark as unreachable */
distances[i] = -1;
continue;
}
+ distances[i] = DBL_MAX;
+
/* If the input is a point, use its coordinates */
/* If its not a point, convert it to one via centroid */
if (lwgeom_get_type(geom) != POINTTYPE)
{
- LWGEOM *centroid = lwgeom_centroid(geom);
+ LWGEOM* centroid = lwgeom_centroid(geom);
if ((!centroid) || lwgeom_is_empty(centroid))
{
- config.objs[i] = NULL;
+ objs[i] = NULL;
continue;
}
centroids[num_centroids++] = centroid;
@@ -137,40 +213,46 @@
/* Store a pointer to the POINT2D we are interested in */
cp = getPoint2d_cp(lwpoint->point, 0);
- config.objs[i] = (Pointer)cp;
+ objs[i] = (POINT2D*)cp;
- /* Find the point that's on the boundary to use as seed */
- if (xmin > cp->x)
+ /* Find the point with largest Euclidean norm to use as seed */
+ curr_norm = (cp->x) * (cp->x) + (cp->y) * (cp->y);
+ if (curr_norm > max_norm)
{
boundary_point_idx = i;
- xmin = cp->x;
+ max_norm = curr_norm;
}
}
- if (xmin == DBL_MAX)
+ if (max_norm == -DBL_MAX)
{
lwerror("unable to calculate any cluster seed point, too many NULLs or empties?");
}
/* start with point on boundary */
distances[boundary_point_idx] = -1;
- centers_raw[0] = *((POINT2D*)config.objs[boundary_point_idx]);
- config.centers[0] = &(centers_raw[0]);
+ centers_raw[0] = *((POINT2D*)objs[boundary_point_idx]);
+ centers[0] = &(centers_raw[0]);
/* loop i on clusters, skip 0 as it's found already */
for (i = 1; i < k; i++)
{
- int j;
+ unsigned int j;
double max_distance = -DBL_MAX;
- int candidate_center = 0;
+ double curr_distance;
+ unsigned int candidate_center = 0;
/* loop j on objs */
- for (j = 0; j < config.num_objs; j++)
+ for (j = 0; j < n; j++)
{
/* empty objs and accepted clusters are already marked with distance = -1 */
- if (distances[j] < 0) continue;
+ if (distances[j] < 0)
+ continue;
- /* greedily take a point that's farthest from all accepted clusters */
- distances[j] += lwkmeans_pt_distance(&config.objs[j], ¢ers_raw[i-1]);
+ /* update distance to closest cluster */
+ curr_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
+ distances[j] = fmin(curr_distance, distances[j]);
+
+ /* greedily take a point that's farthest from any of accepted clusters */
if (distances[j] > max_distance)
{
candidate_center = j;
@@ -184,34 +266,26 @@
/* accept candidtate to centers */
distances[candidate_center] = -1;
- /* Copy the point coordinates into the initial centers array */
- /* This is ugly, but the centers array is an array of */
- /* pointers to points, not an array of points */
- centers_raw[i] = *((POINT2D*)config.objs[candidate_center]);
- config.centers[i] = &(centers_raw[i]);
+ /* Copy the point coordinates into the initial centers array
+ * This is ugly, but the centers array is an array of pointers to points, not an array of points */
+ centers_raw[i] = *((POINT2D*)objs[candidate_center]);
+ centers[i] = &(centers_raw[i]);
}
- result = kmeans(&config);
+ result = kmeans(objs, clusters, n, centers, k);
/* Before error handling, might as well clean up all the inputs */
- lwfree(config.objs);
- lwfree(config.centers);
+ lwfree(objs);
+ lwfree(centers);
lwfree(centers_raw);
lwfree(centroids);
lwfree(distances);
/* Good result */
- if (result == KMEANS_OK)
- return config.clusters;
+ if (result)
+ return clusters;
/* Bad result, not going to need the answer */
- lwfree(config.clusters);
- if (result == KMEANS_EXCEEDED_MAX_ITERATIONS)
- {
- lwerror("%s did not converge after %d iterations", __func__, config.max_iterations);
- return NULL;
- }
-
- /* Unknown error */
+ lwfree(clusters);
return NULL;
-}
+}
\ No newline at end of file
Modified: trunk/regress/lwgeom_regress.sql
===================================================================
--- trunk/regress/lwgeom_regress.sql 2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/regress/lwgeom_regress.sql 2018-01-10 19:27:48 UTC (rev 16251)
@@ -5,9 +5,6 @@
wkb_ndr text
);
-
-
-
INSERT INTO test_data VALUES
(1, 'MULTIPOINT(1 2)', '00000000040000000100000000013FF00000000000004000000000000000', '0104000000010000000101000000000000000000F03F0000000000000040'),
(-5, 'LINESTRING(1 2,3 4)', '0000000002000000023FF0000000000000400000000000000040080000000000004010000000000000', '010200000002000000000000000000F03F000000000000004000000000000008400000000000001040'),
@@ -190,7 +187,7 @@
from points) z;
-- check that grid gets clustered to clusters of similar size
-select '#3971', count(*) >= 12 -- in perfect match it's 25, #3971 increases it to 12 from 4
+select '#3971', count(*) between 16 and 25 -- in perfect match it's 25, better kmeans init can improve
from (
with
points as (
More information about the postgis-tickets
mailing list