[postgis-tickets] r16251 - Simplify KMeans code.

Darafei komzpa at gmail.com
Wed Jan 10 11:27:49 PST 2018


Author: komzpa
Date: 2018-01-10 11:27:48 -0800 (Wed, 10 Jan 2018)
New Revision: 16251

Removed:
   trunk/liblwgeom/kmeans.c
   trunk/liblwgeom/kmeans.h
Modified:
   trunk/NEWS
   trunk/liblwgeom/Makefile.in
   trunk/liblwgeom/lwkmeans.c
   trunk/regress/lwgeom_regress.sql
Log:
Simplify KMeans code.

Remove threaded implementation.
Rewrite generic implementation to be POINT2D specific.

Closes #3977
Closes https://github.com/postgis/postgis/pull/186


Modified: trunk/NEWS
===================================================================
--- trunk/NEWS	2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/NEWS	2018-01-10 19:27:48 UTC (rev 16251)
@@ -32,6 +32,7 @@
   - #3965, ST_ClusterKMeans used to lose some clusters on initialization
            (Darafei Praliaskouski)
   - #3971, ST_ClusterKMeans now uses better initial seed (Darafei Praliaskouski)
+  - #3977, ST_ClusterKMeans is now faster and simpler (Darafei Praliaskouski)
 
 PostGIS 2.4.0
 2017/09/30

Modified: trunk/liblwgeom/Makefile.in
===================================================================
--- trunk/liblwgeom/Makefile.in	2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/Makefile.in	2018-01-10 19:27:48 UTC (rev 16251)
@@ -115,7 +115,6 @@
 	lwunionfind.o \
 	effectivearea.o \
 	lwkmeans.o \
-	kmeans.o \
 	varint.o
 
 NM_OBJS = \
@@ -179,9 +178,6 @@
 ../postgis_svn_revision.h:
 	$(MAKE) -C .. postgis_svn_revision.h
 
-#liblwgeom.a: $(SA_OBJS) $(NM_OBJS) $(SA_HEADERS)
-#ar rs liblwgeom.a $(SA_OBJS) $(NM_OBJS)
-
 liblwgeom.la: $(LT_OBJS)
 	$(LIBTOOL) --tag=CC --mode=link $(CC) -rpath $(libdir) $(LT_OBJS) \
              -release $(SOVER) -version-info $(VERSION_INFO) $(LDFLAGS) -o $@

Deleted: trunk/liblwgeom/kmeans.c
===================================================================
--- trunk/liblwgeom/kmeans.c	2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/kmeans.c	2018-01-10 19:27:48 UTC (rev 16251)
@@ -1,317 +0,0 @@
-/*-------------------------------------------------------------------------
-*
-* kmeans.c
-*    Generic k-means implementation
-*
-* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
-*
-*------------------------------------------------------------------------*/
-
-#include <assert.h>
-#include <float.h>
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "kmeans.h"
-
-#ifdef KMEANS_THREADED
-#include <pthread.h>
-#endif
-
-static void
-update_r(kmeans_config *config)
-{
-	int i;
-
-	for (i = 0; i < config->num_objs; i++)
-	{
-		double distance, curr_distance;
-		int cluster, curr_cluster;
-		Pointer obj;
-
-		assert(config->objs != NULL);
-		assert(config->num_objs > 0);
-		assert(config->centers);
-		assert(config->clusters);
-
-		obj = config->objs[i];
-
-		/*
-		* Don't try to cluster NULL objects, just add them
-		* to the "unclusterable cluster"
-		*/
-		if (!obj)
-		{
-			config->clusters[i] = KMEANS_NULL_CLUSTER;
-			continue;
-		}
-
-		/* Initialize with distance to first cluster */
-		curr_distance = (config->distance_method)(obj, config->centers[0]);
-		curr_cluster = 0;
-
-		/* Check all other cluster centers and find the nearest */
-		for (cluster = 1; cluster < config->k; cluster++)
-		{
-			distance = (config->distance_method)(obj, config->centers[cluster]);
-			if (distance < curr_distance)
-			{
-				curr_distance = distance;
-				curr_cluster = cluster;
-			}
-		}
-
-		/* Store the nearest cluster this object is in */
-		config->clusters[i] = curr_cluster;
-	}
-}
-
-static void
-update_means(kmeans_config *config)
-{
-	int i;
-
-	for (i = 0; i < config->k; i++)
-	{
-		/* Update the centroid for this cluster */
-		(config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
-	}
-}
-
-#ifdef KMEANS_THREADED
-
-static void * update_r_threaded_main(void *args)
-{
-	kmeans_config *config = (kmeans_config*)args;
-	update_r(config);
-	pthread_exit(args);
-}
-
-static void update_r_threaded(kmeans_config *config)
-{
-	/* Computational complexity is function of objs/clusters */
-	/* We only spin up threading infra if we need more than one core */
-	/* running. We keep the threshold high so the overhead of */
-	/* thread management is small compared to thread compute time */
-	int num_threads = config->num_objs * config->k / KMEANS_THR_THRESHOLD;
-
-	/* Can't run more threads than the maximum */
-	num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
-
-	/* If the problem size is small, don't bother w/ threading */
-	if (num_threads < 1)
-	{
-		update_r(config);
-	}
-	else
-	{
-		pthread_t thread[KMEANS_THR_MAX];
-	    pthread_attr_t thread_attr;
-		kmeans_config thread_config[KMEANS_THR_MAX];
-		int obs_per_thread = config->num_objs / num_threads;
-	    int i, rc;
-
-		for (i = 0; i < num_threads; i++)
-		{
-			/*
-			* Each thread gets a copy of the config, but with the list pointers
-			* offset to the start of the batch the thread is responsible for, and the
-			* object count number adjusted similarly.
-			*/
-			memcpy(&(thread_config[i]), config, sizeof(kmeans_config));
-			thread_config[i].objs += i*obs_per_thread;
-			thread_config[i].clusters += i*obs_per_thread;
-			thread_config[i].num_objs = obs_per_thread;
-			if (i == num_threads-1)
-			{
-				thread_config[i].num_objs += config->num_objs - num_threads*obs_per_thread;
-			}
-
-		    /* Initialize and set thread detached attribute */
-		    pthread_attr_init(&thread_attr);
-		    pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
-
-			/* Now we just run the thread, on its subset of the data */
-			rc = pthread_create(&thread[i], &thread_attr, update_r_threaded_main, (void *) &thread_config[i]);
-			if (rc)
-			{
-				printf("ERROR: return code from pthread_create() is %d\n", rc);
-				exit(-1);
-			}
-		}
-
-	    /* Free attribute and wait for the other threads */
-	    pthread_attr_destroy(&thread_attr);
-
-		/* Wait for all calculations to complete */
-		for (i = 0; i < num_threads; i++)
-		{
-		    void *status;
-			rc = pthread_join(thread[i], &status);
-			if (rc)
-			{
-				printf("ERROR: return code from pthread_join() is %d\n", rc);
-				exit(-1);
-			}
-		}
-	}
-}
-
-int update_means_k;
-pthread_mutex_t update_means_k_mutex;
-
-static void *
-update_means_threaded_main(void *arg)
-{
-	kmeans_config *config = (kmeans_config*)arg;
-	int i = 0;
-
-	do
-	{
-		pthread_mutex_lock (&update_means_k_mutex);
-		i = update_means_k;
-		update_means_k++;
-		pthread_mutex_unlock (&update_means_k_mutex);
-
-		if (i < config->k)
-			(config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
-	}
-	while (i < config->k);
-
-	pthread_exit(arg);
-}
-
-static void
-update_means_threaded(kmeans_config *config)
-{
-	/* We only spin up threading infra if we need more than one core */
-	/* running. We keep the threshold high so the overhead of */
-	/* thread management is small compared to thread compute time */
-	int num_threads = config->num_objs / KMEANS_THR_THRESHOLD;
-
-	/* Can't run more threads than the maximum */
-	num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
-
-	/* If the problem size is small, don't bother w/ threading */
-	if (num_threads < 1)
-	{
-		update_means(config);
-	}
-	else
-	{
-		/* Mutex protected counter to drive threads */
-		pthread_t thread[KMEANS_THR_MAX];
-		pthread_attr_t thread_attr;
-		int i, rc;
-
-		pthread_mutex_init(&update_means_k_mutex, NULL);
-		update_means_k = 0;
-
-		pthread_attr_init(&thread_attr);
-		pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
-
-		/* Create threads to perform computation  */
-		for (i = 0; i < num_threads; i++)
-		{
-
-			/* Now we just run the thread, on its subset of the data */
-			rc = pthread_create(&thread[i], &thread_attr, update_means_threaded_main, (void *) config);
-			if (rc)
-			{
-				printf("ERROR: return code from pthread_create() is %d\n", rc);
-				exit(-1);
-			}
-		}
-
-		pthread_attr_destroy(&thread_attr);
-
-		/* Watch until completion  */
-		for (i = 0; i < num_threads; i++)
-		{
-		    void *status;
-			rc = pthread_join(thread[i], &status);
-			if (rc)
-			{
-				printf("ERROR: return code from pthread_join() is %d\n", rc);
-				exit(-1);
-			}
-		}
-
-		pthread_mutex_destroy(&update_means_k_mutex);
-	}
-}
-
-#endif /* KMEANS_THREADED */
-
-kmeans_result
-kmeans(kmeans_config *config)
-{
-	int iterations = 0;
-	int *clusters_last;
-	size_t clusters_sz = sizeof(int)*config->num_objs;
-
-	assert(config);
-	assert(config->objs);
-	assert(config->num_objs);
-	assert(config->distance_method);
-	assert(config->centroid_method);
-	assert(config->centers);
-	assert(config->k);
-	assert(config->clusters);
-	assert(config->k <= config->num_objs);
-
-	/* Zero out cluster numbers, just in case user forgets */
-	memset(config->clusters, 0, clusters_sz);
-
-	/* Set default max iterations if necessary */
-	if (!config->max_iterations)
-		config->max_iterations = KMEANS_MAX_ITERATIONS;
-
-	/*
-	 * Previous cluster state array. At this time, r doesn't mean anything
-	 * but it's ok
-	 */
-	clusters_last = kmeans_malloc(clusters_sz);
-
-	while (1)
-	{
-		LW_ON_INTERRUPT(kmeans_free(clusters_last); return KMEANS_ERROR);
-
-		/* Store the previous state of the clustering */
-		memcpy(clusters_last, config->clusters, clusters_sz);
-
-#ifdef KMEANS_THREADED
-		update_r_threaded(config);
-		update_means_threaded(config);
-#else
-		update_r(config);
-		update_means(config);
-#endif
-		/*
-		 * if all the cluster numbers are unchanged since last time,
-		 * we are at a stable solution, so we can stop here
-		 */
-		if (memcmp(clusters_last, config->clusters, clusters_sz) == 0)
-		{
-			kmeans_free(clusters_last);
-			config->total_iterations = iterations;
-			return KMEANS_OK;
-		}
-
-		if (iterations++ > config->max_iterations)
-		{
-			kmeans_free(clusters_last);
-			config->total_iterations = iterations;
-			return KMEANS_EXCEEDED_MAX_ITERATIONS;
-		}
-
-	}
-
-	kmeans_free(clusters_last);
-	config->total_iterations = iterations;
-	return KMEANS_ERROR;
-}
-
-

Deleted: trunk/liblwgeom/kmeans.h
===================================================================
--- trunk/liblwgeom/kmeans.h	2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/kmeans.h	2018-01-10 19:27:48 UTC (rev 16251)
@@ -1,126 +0,0 @@
-/*-------------------------------------------------------------------------
-*
-* kmeans.h
-*    Generic k-means implementation
-*
-* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
-*
-*------------------------------------------------------------------------*/
-
-
-#include <stdlib.h>
-#include "liblwgeom_internal.h"
-
-/*
-* Simple k-means implementation for arbitrary data structures
-*
-* Since k-means partitions based on inter-object "distance" the same
-* machinery can be used to support any object type that can calculate a
-* "distance" between pairs.
-*
-* To use the k-means infrastructure, just fill out the kmeans_config
-* structure and invoke the kmeans() function.
-*/
-
-/*
-* Threaded calculation is available using pthreads, which practically
-* means UNIX platforms only, unless you're building with a posix
-* compatible environment.
-*
-* #define KMEANS_THREADED
-*/
-
-/*
-* When clustering lists with NULL elements, they will get this as
-* their cluster number. (All the other clusters will be non-negative)
-*/
-#define KMEANS_NULL_CLUSTER -1
-
-/*
-* If the algorithm doesn't converge within this number of iterations,
-* it will return with a failure error code.
-*/
-#define KMEANS_MAX_ITERATIONS 1000
-
-/*
-* The code doesn't try to figure out how many threads to use, so
-* best to set this to the number of cores you expect to have
-* available. The threshold is the the value of k*n at which to
-* move to multi-threading.
-*/
-#ifdef KMEANS_THREADED
-#define KMEANS_THR_MAX 4
-#define KMEANS_THR_THRESHOLD 250000
-#endif
-
-#define kmeans_malloc(size) lwalloc(size)
-#define kmeans_free(ptr) lwfree(ptr)
-
-typedef void * Pointer;
-
-typedef enum {
-	KMEANS_OK,
-	KMEANS_EXCEEDED_MAX_ITERATIONS,
-	KMEANS_ERROR
-} kmeans_result;
-
-/*
-* Prototype for the distance calculating function
-*/
-typedef double (*kmeans_distance_method) (const Pointer a, const Pointer b);
-
-/*
-* Prototype for the centroid calculating function
-* @param objs the list of all objects in the calculation
-* @param clusters the list of cluster numbers for each object
-* @param num_objs the number of objects/cluster numbers in the previous arrays
-* @param cluster the cluster number we are actually generating a centroid for here
-* @param centroid the object to write the centroid result into (already allocated)
-*/
-typedef void (*kmeans_centroid_method) (const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid);
-
-typedef struct kmeans_config
-{
-	/* Function returns the "distance" between any pair of objects */
-	kmeans_distance_method distance_method;
-
-	/* Function returns the "centroid" of a collection of objects */
-	kmeans_centroid_method centroid_method;
-
-	/* An array of objects to be analyzed. User allocates this array */
-	/* and is responsible for freeing it. */
-	/* For objects that are not capable of participating in the distance */
-	/* calculations, but for which you still want included in the process */
-	/* (for examples, database nulls, or geometry empties) use a NULL */
-	/* value in this list. All NULL values will be returned in the */
-	/* KMEANS_NULL_CLUSTER. */
-	Pointer * objs;
-
-	/* Number of objects in the preceding array */
-	size_t num_objs;
-
-	/* An array of initial centers for the algorithm */
-	/* Can be randomly assigned, or using proportions, */
-	/* unfortunately the algorithm is sensitive to starting */
-	/* points, so using a "better" set of starting points */
-	/* might be wise. User allocates and is responsible for freeing. */
-	Pointer * centers;
-
-	/* Number of means we are calculating, length of preceding array */
-	unsigned int k;
-
-	/* Maximum number of times to iterate the algorithm, or 0 for */
-	/* library default */
-	unsigned int max_iterations;
-
-	/* Iteration counter */
-	unsigned int total_iterations;
-
-	/* Array to fill in with cluster numbers. User allocates and frees. */
-	int * clusters;
-
-} kmeans_config;
-
-/* This is where the magic happens. */
-kmeans_result kmeans(kmeans_config *config);
-

Modified: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c	2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/liblwgeom/lwkmeans.c	2018-01-10 19:27:48 UTC (rev 16251)
@@ -1,130 +1,206 @@
+/*-------------------------------------------------------------------------
+ *
+ * Copyright (c) 2018, Darafei Praliaskouski <me at komzpa.net>
+ * Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
+ *
+ *------------------------------------------------------------------------*/
+
 #include <float.h>
 #include <math.h>
 
-#include "kmeans.h"
 #include "liblwgeom_internal.h"
 
-static double lwkmeans_pt_distance(const Pointer a, const Pointer b)
+/*
+ * When clustering lists with NULL elements, they will get this as
+ * their cluster number. (All the other clusters will be non-negative)
+ */
+#define KMEANS_NULL_CLUSTER -1
+
+/*
+ * If the algorithm doesn't converge within this number of iterations,
+ * it will return with a failure error code.
+ */
+#define KMEANS_MAX_ITERATIONS 1000
+
+static void
+update_r(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, unsigned int k)
 {
-	POINT2D *pa = (POINT2D*)a;
-	POINT2D *pb = (POINT2D*)b;
+	POINT2D* obj;
+	unsigned int i;
+	double distance, curr_distance;
+	int cluster, curr_cluster;
 
-	double dx = (pa->x - pb->x);
-	double dy = (pa->y - pb->y);
+	for (i = 0; i < n; i++)
+	{
+		obj = objs[i];
 
-	return dx*dx + dy*dy;
+		/* Don't try to cluster NULL objects, just add them to the "unclusterable cluster" */
+		if (!obj)
+		{
+			clusters[i] = KMEANS_NULL_CLUSTER;
+			continue;
+		}
+
+		/* Initialize with distance to first cluster */
+		curr_distance = distance2d_sqr_pt_pt(obj, centers[0]);
+		curr_cluster = 0;
+
+		/* Check all other cluster centers and find the nearest */
+		for (cluster = 1; cluster < k; cluster++)
+		{
+			distance = distance2d_sqr_pt_pt(obj, centers[cluster]);
+			if (distance < curr_distance)
+			{
+				curr_distance = distance;
+				curr_cluster = cluster;
+			}
+		}
+
+		/* Store the nearest cluster this object is in */
+		clusters[i] = curr_cluster;
+	}
 }
 
-static void lwkmeans_pt_centroid(const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid)
+static void
+update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, unsigned int* weights, unsigned int k)
 {
-	int i;
-	int num_cluster = 0;
-	POINT2D sum;
-	POINT2D **pts = (POINT2D**)objs;
-	POINT2D *center = (POINT2D*)centroid;
+	unsigned int i;
 
-	sum.x = sum.y = 0.0;
-
-	if (num_objs <= 0) return;
-
-	for (i = 0; i < num_objs; i++)
+	memset(weights, 0, sizeof(int) * k);
+	for (i = 0; i < k; i++)
 	{
-		/* Skip points that are not of interest */
-		if (clusters[i] != cluster) continue;
-
-		sum.x += pts[i]->x;
-		sum.y += pts[i]->y;
-		num_cluster++;
+		centers[i]->x = 0.0;
+		centers[i]->y = 0.0;
 	}
-	if (num_cluster)
+	for (i = 0; i < n; i++)
 	{
-		sum.x /= num_cluster;
-		sum.y /= num_cluster;
-		*center = sum;
+		centers[clusters[i]]->x += objs[i]->x;
+		centers[clusters[i]]->y += objs[i]->y;
+		weights[clusters[i]] += 1;
 	}
-	return;
+	for (i = 0; i < k; i++)
+	{
+		centers[i]->x /= weights[i];
+		centers[i]->y /= weights[i];
+	}
 }
 
-
-int *
-lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, int ngeoms, int k)
+static int
+kmeans(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, unsigned int k)
 {
-	int i;
-	int num_centroids = 0;
-	LWGEOM **centroids;
-	POINT2D *centers_raw;
-	double *distances;
-	const POINT2D *cp;
-	kmeans_config config;
-	kmeans_result result;
-	int boundary_point_idx = 0;
-	double xmin = DBL_MAX;
+	unsigned int i = 0;
+	int* clusters_last;
+	int converged = LW_FALSE;
+	uint32_t clusters_sz = sizeof(int) * n;
+	unsigned int* weights;
 
-	assert(k>0);
-	assert(ngeoms>0);
-	assert(geoms);
+	weights = lwalloc(sizeof(int) * k);
 
-	/* Initialize our static structs */
-	memset(&config, 0, sizeof(kmeans_config));
-	memset(&result, 0, sizeof(kmeans_result));
+	/*
+	 * Previous cluster state array. At this time, r doesn't mean anything
+	 * but it's ok
+	 */
+	clusters_last = lwalloc(clusters_sz);
 
-	if (ngeoms < k)
+	for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
 	{
-		lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
+		LW_ON_INTERRUPT(break);
+
+		/* Store the previous state of the clustering */
+		memcpy(clusters_last, clusters, clusters_sz);
+
+		update_r(objs, clusters, n, centers, k);
+		update_means(objs, clusters, n, centers, weights, k);
+
+		/* if all the cluster numbers are unchanged, we are at a stable solution */
+		converged = memcmp(clusters_last, clusters, clusters_sz) == 0;
 	}
 
+	lwfree(clusters_last);
+	lwfree(weights);
+	if (!converged)
+		lwerror("%s did not converge after %d iterations", __func__, i);
+	return converged;
+}
+
+int*
+lwgeom_cluster_2d_kmeans(const LWGEOM** geoms, int n, int k)
+{
+	unsigned int i;
+	unsigned int num_centroids = 0;
+	LWGEOM** centroids;
+	POINT2D* centers_raw;
+	double* distances;
+	const POINT2D* cp;
+	int result = LW_FALSE;
+	unsigned int boundary_point_idx = 0;
+	double max_norm = -DBL_MAX;
+	double curr_norm;
+
+	/* An array of objects to be analyzed.
+	 * All NULL values will be returned in the  KMEANS_NULL_CLUSTER. */
+	POINT2D** objs;
+
+	/* An array of centers for the algorithm. */
+	POINT2D** centers;
+
+	/* Array to fill in with cluster numbers. */
+	int* clusters;
+
+	assert(k > 0);
+	assert(n > 0);
+	assert(geoms);
+
+	if (n < k)
+		lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
+
 	/* We'll hold the temporary centroid objects here */
-	centroids = lwalloc(sizeof(LWGEOM*) * ngeoms);
-	memset(centroids, 0, sizeof(LWGEOM*) * ngeoms);
+	centroids = lwalloc(sizeof(LWGEOM*) * n);
+	memset(centroids, 0, sizeof(LWGEOM*) * n);
 
-	/* The vector of cluster means. We have to allocate a */
-	/* chunk of memory for these because we'll be mutating them */
-	/* in the kmeans algorithm */
+	/* The vector of cluster means. We have to allocate a chunk of memory for these because we'll be mutating them
+	 * in the kmeans algorithm */
 	centers_raw = lwalloc(sizeof(POINT2D) * k);
 	memset(centers_raw, 0, sizeof(POINT2D) * k);
 
 	/* K-means configuration setup */
-	config.objs = lwalloc(sizeof(Pointer) * ngeoms);
-	config.num_objs = ngeoms;
-	config.clusters = lwalloc(sizeof(int) * ngeoms);
-	config.centers = lwalloc(sizeof(Pointer) * k);
-	config.k = k;
-	config.max_iterations = 0;
-	config.distance_method = lwkmeans_pt_distance;
-	config.centroid_method = lwkmeans_pt_centroid;
+	objs = lwalloc(sizeof(POINT2D*) * n);
+	clusters = lwalloc(sizeof(int) * n);
+	centers = lwalloc(sizeof(POINT2D*) * k);
 
 	/* Clean the memory */
-	memset(config.objs, 0, sizeof(Pointer) * ngeoms);
-	memset(config.clusters, 0, sizeof(int) * ngeoms);
-	memset(config.centers, 0, sizeof(Pointer) * k);
+	memset(objs, 0, sizeof(POINT2D*) * n);
+	memset(clusters, 0, sizeof(int) * n);
+	memset(centers, 0, sizeof(POINT2D*) * k);
 
 	/* Array of sums of distances to a point from accepted cluster centers */
-	distances = lwalloc(sizeof(double)*config.num_objs);
-	memset(distances, 0, sizeof(double)*config.num_objs);
+	distances = lwalloc(sizeof(double) * n);
 
 	/* Prepare the list of object pointers for K-means */
-	for (i = 0; i < ngeoms; i++)
+	for (i = 0; i < n; i++)
 	{
-		const LWGEOM *geom = geoms[i];
-		LWPOINT *lwpoint;
+		const LWGEOM* geom = geoms[i];
+		LWPOINT* lwpoint;
 
 		/* Null/empty geometries get a NULL pointer */
 		if ((!geom) || lwgeom_is_empty(geom))
 		{
-			config.objs[i] = NULL;
+			objs[i] = NULL;
 			/* mark as unreachable */
 			distances[i] = -1;
 			continue;
 		}
 
+		distances[i] = DBL_MAX;
+
 		/* If the input is a point, use its coordinates */
 		/* If its not a point, convert it to one via centroid */
 		if (lwgeom_get_type(geom) != POINTTYPE)
 		{
-			LWGEOM *centroid = lwgeom_centroid(geom);
+			LWGEOM* centroid = lwgeom_centroid(geom);
 			if ((!centroid) || lwgeom_is_empty(centroid))
 			{
-				config.objs[i] = NULL;
+				objs[i] = NULL;
 				continue;
 			}
 			centroids[num_centroids++] = centroid;
@@ -137,40 +213,46 @@
 
 		/* Store a pointer to the POINT2D we are interested in */
 		cp = getPoint2d_cp(lwpoint->point, 0);
-		config.objs[i] = (Pointer)cp;
+		objs[i] = (POINT2D*)cp;
 
-		/* Find the point that's on the boundary to use as seed */
-		if (xmin > cp->x)
+		/* Find the point with largest Euclidean norm to use as seed */
+		curr_norm = (cp->x) * (cp->x) + (cp->y) * (cp->y);
+		if (curr_norm > max_norm)
 		{
 			boundary_point_idx = i;
-			xmin = cp->x;
+			max_norm = curr_norm;
 		}
 	}
 
-	if (xmin == DBL_MAX)
+	if (max_norm == -DBL_MAX)
 	{
 		lwerror("unable to calculate any cluster seed point, too many NULLs or empties?");
 	}
 
 	/* start with point on boundary */
 	distances[boundary_point_idx] = -1;
-	centers_raw[0] = *((POINT2D*)config.objs[boundary_point_idx]);
-	config.centers[0] = &(centers_raw[0]);
+	centers_raw[0] = *((POINT2D*)objs[boundary_point_idx]);
+	centers[0] = &(centers_raw[0]);
 	/* loop i on clusters, skip 0 as it's found already */
 	for (i = 1; i < k; i++)
 	{
-		int j;
+		unsigned int j;
 		double max_distance = -DBL_MAX;
-		int candidate_center = 0;
+		double curr_distance;
+		unsigned int candidate_center = 0;
 
 		/* loop j on objs */
-		for (j = 0; j < config.num_objs; j++)
+		for (j = 0; j < n; j++)
 		{
 			/* empty objs and accepted clusters are already marked with distance = -1 */
-			if (distances[j] < 0) continue;
+			if (distances[j] < 0)
+				continue;
 
-			/* greedily take a point that's farthest from all accepted clusters */
-			distances[j] += lwkmeans_pt_distance(&config.objs[j], &centers_raw[i-1]);
+			/* update distance to closest cluster */
+			curr_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
+			distances[j] = fmin(curr_distance, distances[j]);
+
+			/* greedily take a point that's farthest from any of accepted clusters */
 			if (distances[j] > max_distance)
 			{
 				candidate_center = j;
@@ -184,34 +266,26 @@
 
 		/* accept candidtate to centers */
 		distances[candidate_center] = -1;
-		/* Copy the point coordinates into the initial centers array */
-		/* This is ugly, but the centers array is an array of */
-		/* pointers to points, not an array of points */
-		centers_raw[i] = *((POINT2D*)config.objs[candidate_center]);
-		config.centers[i] = &(centers_raw[i]);
+		/* Copy the point coordinates into the initial centers array
+		 * This is ugly, but the centers array is an array of pointers to points, not an array of points */
+		centers_raw[i] = *((POINT2D*)objs[candidate_center]);
+		centers[i] = &(centers_raw[i]);
 	}
 
-	result = kmeans(&config);
+	result = kmeans(objs, clusters, n, centers, k);
 
 	/* Before error handling, might as well clean up all the inputs */
-	lwfree(config.objs);
-	lwfree(config.centers);
+	lwfree(objs);
+	lwfree(centers);
 	lwfree(centers_raw);
 	lwfree(centroids);
 	lwfree(distances);
 
 	/* Good result */
-	if (result == KMEANS_OK)
-		return config.clusters;
+	if (result)
+		return clusters;
 
 	/* Bad result, not going to need the answer */
-	lwfree(config.clusters);
-	if (result == KMEANS_EXCEEDED_MAX_ITERATIONS)
-	{
-		lwerror("%s did not converge after %d iterations", __func__, config.max_iterations);
-		return NULL;
-	}
-
-	/* Unknown error */
+	lwfree(clusters);
 	return NULL;
-}
+}
\ No newline at end of file

Modified: trunk/regress/lwgeom_regress.sql
===================================================================
--- trunk/regress/lwgeom_regress.sql	2018-01-10 14:56:22 UTC (rev 16250)
+++ trunk/regress/lwgeom_regress.sql	2018-01-10 19:27:48 UTC (rev 16251)
@@ -5,9 +5,6 @@
     wkb_ndr text
 );
 
-
-
-
 INSERT INTO test_data VALUES
 (1, 'MULTIPOINT(1 2)', '00000000040000000100000000013FF00000000000004000000000000000', '0104000000010000000101000000000000000000F03F0000000000000040'),
 (-5, 'LINESTRING(1 2,3 4)', '0000000002000000023FF0000000000000400000000000000040080000000000004010000000000000', '010200000002000000000000000000F03F000000000000004000000000000008400000000000001040'),
@@ -190,7 +187,7 @@
          from points) z;
 
 -- check that grid gets clustered to clusters of similar size
-select '#3971', count(*) >= 12 -- in perfect match it's 25, #3971 increases it to 12 from 4
+select '#3971', count(*) between 16 and 25 -- in perfect match it's 25, better kmeans init can improve
 from (
          with
                  points as (



More information about the postgis-tickets mailing list