[postgis-tickets] r14673 - #3465, ST_Kmeans

Paul Ramsey pramsey at cleverelephant.ca
Wed Feb 24 02:44:12 PST 2016


Author: pramsey
Date: 2016-02-24 02:44:11 -0800 (Wed, 24 Feb 2016)
New Revision: 14673

Added:
   trunk/liblwgeom/kmeans.c
   trunk/liblwgeom/kmeans.h
   trunk/liblwgeom/lwkmeans.c
   trunk/postgis/lwgeom_window.c
Modified:
   trunk/NEWS
   trunk/liblwgeom/Makefile.in
   trunk/liblwgeom/cunit/cu_algorithm.c
   trunk/liblwgeom/liblwgeom.h.in
   trunk/liblwgeom/lwgeom_geos.c
   trunk/liblwgeom/stringbuffer.c
   trunk/liblwgeom/stringbuffer.h
   trunk/postgis/Makefile.in
   trunk/postgis/postgis.sql.in
Log:
#3465, ST_Kmeans


Modified: trunk/NEWS
===================================================================
--- trunk/NEWS	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/NEWS	2016-02-24 10:44:11 UTC (rev 14673)
@@ -12,6 +12,7 @@
   - #2259 ST_Voronoi (Dan Baston)
   - #3339 ST_GeneratePoints (Paul Ramsey)
   - #3428 ST_Points (Dan Baston)
+  - #3465 ST_KMeans (Paul Ramsey)
 
 PostGIS 2.2.1
 2016/01/06

Modified: trunk/liblwgeom/Makefile.in
===================================================================
--- trunk/liblwgeom/Makefile.in	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/Makefile.in	2016-02-24 10:44:11 UTC (rev 14673)
@@ -113,6 +113,8 @@
 	lwgeom_transform.o \
 	lwunionfind.o \
 	effectivearea.o \
+	lwkmeans.o \
+	kmeans.o \
 	varint.o
 
 NM_OBJS = \

Modified: trunk/liblwgeom/cunit/cu_algorithm.c
===================================================================
--- trunk/liblwgeom/cunit/cu_algorithm.c	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/cunit/cu_algorithm.c	2016-02-24 10:44:11 UTC (rev 14673)
@@ -1023,7 +1023,47 @@
 	lwgeom_free(geom);
 }
 
+static void test_kmeans(void)
+{
+	static int cluster_size = 100;
+	static int num_clusters = 4;
+	static double spread = 1.5;
+	int N = cluster_size * num_clusters;
+	LWGEOM **geoms;
+	int i, j, k=0;
+	int *r;
 
+	geoms = lwalloc(sizeof(LWGEOM*) * N);
+
+	for (j = 0; j < num_clusters; j++) {
+		for (i = 0; i < cluster_size; i++)
+		{
+			double u1 = 1.0 * random() / RAND_MAX;
+			double u2 = 1.0 * random() / RAND_MAX;
+			double z1 = spread * j + sqrt(-2*log2(u1))*cos(2*M_PI*u2);
+			double z2 = spread * j + sqrt(-2*log2(u1))*sin(2*M_PI*u2);
+
+			LWPOINT *lwp = lwpoint_make2d(SRID_UNKNOWN, z1, z2);
+			geoms[k++] = lwpoint_as_lwgeom(lwp);
+		}
+	}
+
+	r = lwgeom_cluster_2d_kmeans((const LWGEOM **)geoms, N, num_clusters);
+
+	// for (i = 0; i < k; i++)
+	// {
+	// 	printf("[%d] %s\n", r[i], lwgeom_to_ewkt(geoms[i]));
+	// }
+
+	/* Clean up */
+	lwfree(r);
+	for (i = 0; i < k; i++)
+		lwgeom_free(geoms[i]);
+	lwfree(geoms);
+
+	return;
+}
+
 /*
 ** Used by test harness to register the tests in this file.
 */
@@ -1050,4 +1090,5 @@
 	PG_ADD_TEST(suite,test_lwgeom_simplify);
 	PG_ADD_TEST(suite,test_lw_arc_center);
 	PG_ADD_TEST(suite,test_point_density);
+	PG_ADD_TEST(suite,test_kmeans);
 }

Added: trunk/liblwgeom/kmeans.c
===================================================================
--- trunk/liblwgeom/kmeans.c	                        (rev 0)
+++ trunk/liblwgeom/kmeans.c	2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,316 @@
+/*-------------------------------------------------------------------------
+*
+* kmeans.c
+*    Generic k-means implementation
+*
+* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
+*
+*------------------------------------------------------------------------*/
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "kmeans.h"
+
+#ifdef KMEANS_THREADED
+#include <pthread.h>
+#endif
+
+static void
+update_r(kmeans_config *config)
+{
+	int i;
+
+	for (i = 0; i < config->num_objs; i++)
+	{
+		double distance, curr_distance;
+		int cluster, curr_cluster;
+		Pointer obj;
+
+		assert(config->objs != NULL);
+		assert(config->num_objs > 0);
+		assert(config->centers);
+		assert(config->clusters);
+
+		obj = config->objs[i];
+
+		/*
+		* Don't try to cluster NULL objects, just add them
+		* to the "unclusterable cluster"
+		*/
+		if (!obj)
+		{
+			config->clusters[i] = KMEANS_NULL_CLUSTER;
+			continue;
+		}
+
+		/* Initialize with distance to first cluster */
+		curr_distance = (config->distance_method)(obj, config->centers[0]);
+		curr_cluster = 0;
+
+		/* Check all other cluster centers and find the nearest */
+		for (cluster = 1; cluster < config->k; cluster++)
+		{
+			distance = (config->distance_method)(obj, config->centers[cluster]);
+			if (distance < curr_distance)
+			{
+				curr_distance = distance;
+				curr_cluster = cluster;
+			}
+		}
+
+		/* Store the nearest cluster this object is in */
+		config->clusters[i] = curr_cluster;
+	}
+}
+
+static void
+update_means(kmeans_config *config)
+{
+	int i;
+
+	for (i = 0; i < config->k; i++)
+	{
+		/* Update the centroid for this cluster */
+		(config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
+	}
+}
+
+#ifdef KMEANS_THREADED
+
+static void * update_r_threaded_main(void *args)
+{
+	kmeans_config *config = (kmeans_config*)args;
+	update_r(config);
+	pthread_exit(args);
+}
+
+static void update_r_threaded(kmeans_config *config)
+{
+	/* Computational complexity is function of objs/clusters */
+	/* We only spin up threading infra if we need more than one core */
+	/* running. We keep the threshold high so the overhead of */
+	/* thread management is small compared to thread compute time */
+	int num_threads = config->num_objs * config->k / KMEANS_THR_THRESHOLD;
+
+	/* Can't run more threads than the maximum */
+	num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
+
+	/* If the problem size is small, don't bother w/ threading */
+	if (num_threads < 1)
+	{
+		update_r(config);
+	}
+	else
+	{
+		pthread_t thread[KMEANS_THR_MAX];
+	    pthread_attr_t thread_attr;
+		kmeans_config thread_config[KMEANS_THR_MAX];
+		int obs_per_thread = config->num_objs / num_threads;
+	    int i, rc;
+
+		for (i = 0; i < num_threads; i++)
+		{
+			/*
+			* Each thread gets a copy of the config, but with the list pointers
+			* offest to the start of the batch the thread is responsible for, and the
+			* object count number adjusted similarly.
+			*/
+			memcpy(&(thread_config[i]), config, sizeof(kmeans_config));
+			thread_config[i].objs += i*obs_per_thread;
+			thread_config[i].clusters += i*obs_per_thread;
+			thread_config[i].num_objs = obs_per_thread;
+			if (i == num_threads-1)
+			{
+				thread_config[i].num_objs += config->num_objs - num_threads*obs_per_thread;
+			}
+
+		    /* Initialize and set thread detached attribute */
+		    pthread_attr_init(&thread_attr);
+		    pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+
+			/* Now we just run the thread, on its subset of the data */
+			rc = pthread_create(&thread[i], &thread_attr, update_r_threaded_main, (void *) &thread_config[i]);
+			if (rc)
+			{
+				printf("ERROR: return code from pthread_create() is %d\n", rc);
+				exit(-1);
+			}
+		}
+
+	    /* Free attribute and wait for the other threads */
+	    pthread_attr_destroy(&thread_attr);
+
+		/* Wait for all calculations to complete */
+		for (i = 0; i < num_threads; i++)
+		{
+		    void *status;
+			rc = pthread_join(thread[i], &status);
+			if (rc)
+			{
+				printf("ERROR: return code from pthread_join() is %d\n", rc);
+				exit(-1);
+			}
+		}
+	}
+}
+
+int update_means_k;
+pthread_mutex_t update_means_k_mutex;
+
+static void *
+update_means_threaded_main(void *arg)
+{
+	kmeans_config *config = (kmeans_config*)arg;
+	int i = 0;
+
+	do
+	{
+		pthread_mutex_lock (&update_means_k_mutex);
+		i = update_means_k;
+		update_means_k++;
+		pthread_mutex_unlock (&update_means_k_mutex);
+
+		if (i < config->k)
+			(config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
+	}
+	while (i < config->k);
+
+	pthread_exit(arg);
+}
+
+static void
+update_means_threaded(kmeans_config *config)
+{
+	/* We only spin up threading infra if we need more than one core */
+	/* running. We keep the threshold high so the overhead of */
+	/* thread management is small compared to thread compute time */
+	int num_threads = config->num_objs / KMEANS_THR_THRESHOLD;
+
+	/* Can't run more threads than the maximum */
+	num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
+
+	/* If the problem size is small, don't bother w/ threading */
+	if (num_threads < 1)
+	{
+		update_means(config);
+	}
+	else
+	{
+		/* Mutex protected counter to drive threads */
+		pthread_t thread[KMEANS_THR_MAX];
+		pthread_attr_t thread_attr;
+		int i, rc;
+
+		pthread_mutex_init(&update_means_k_mutex, NULL);
+		update_means_k = 0;
+
+		pthread_attr_init(&thread_attr);
+		pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+
+		/* Create threads to perform computation  */
+		for (i = 0; i < num_threads; i++)
+		{
+
+			/* Now we just run the thread, on its subset of the data */
+			rc = pthread_create(&thread[i], &thread_attr, update_means_threaded_main, (void *) config);
+			if (rc)
+			{
+				printf("ERROR: return code from pthread_create() is %d\n", rc);
+				exit(-1);
+			}
+		}
+
+		pthread_attr_destroy(&thread_attr);
+
+		/* Watch until completion  */
+		for (i = 0; i < num_threads; i++)
+		{
+		    void *status;
+			rc = pthread_join(thread[i], &status);
+			if (rc)
+			{
+				printf("ERROR: return code from pthread_join() is %d\n", rc);
+				exit(-1);
+			}
+		}
+
+		pthread_mutex_destroy(&update_means_k_mutex);
+	}
+}
+
+#endif /* KMEANS_THREADED */
+
+kmeans_result
+kmeans(kmeans_config *config)
+{
+	int iterations = 0;
+	int *clusters_last;
+	size_t clusters_sz = sizeof(int)*config->num_objs;
+
+	assert(config);
+	assert(config->objs);
+	assert(config->num_objs);
+	assert(config->distance_method);
+	assert(config->centroid_method);
+	assert(config->centers);
+	assert(config->k);
+	assert(config->clusters);
+
+	/* Zero out cluster numbers, just in case user forgets */
+	memset(config->clusters, 0, clusters_sz);
+
+	/* Set default max iterations if necessary */
+	if (!config->max_iterations)
+		config->max_iterations = KMEANS_MAX_ITERATIONS;
+
+	/*
+	 * Previous cluster state array. At this time, r doesn't mean anything
+	 * but it's ok
+	 */
+	clusters_last = kmeans_malloc(clusters_sz);
+
+	while (1)
+	{
+		LW_ON_INTERRUPT(kmeans_free(clusters_last); return KMEANS_ERROR);
+
+		/* Store the previous state of the clustering */
+		memcpy(clusters_last, config->clusters, clusters_sz);
+
+#ifdef KMEANS_THREADED
+		update_r_threaded(config);
+		update_means_threaded(config);
+#else
+		update_r(config);
+		update_means(config);
+#endif
+		/*
+		 * if all the cluster numbers are unchanged since last time,
+		 * we are at a stable solution, so we can stop here
+		 */
+		if (memcmp(clusters_last, config->clusters, clusters_sz) == 0)
+		{
+			kmeans_free(clusters_last);
+			config->total_iterations = iterations;
+			return KMEANS_OK;
+		}
+
+		if (iterations++ > config->max_iterations)
+		{
+			kmeans_free(clusters_last);
+			config->total_iterations = iterations;
+			return KMEANS_EXCEEDED_MAX_ITERATIONS;
+		}
+
+	}
+
+	kmeans_free(clusters_last);
+	config->total_iterations = iterations;
+	return KMEANS_ERROR;
+}
+
+

Added: trunk/liblwgeom/kmeans.h
===================================================================
--- trunk/liblwgeom/kmeans.h	                        (rev 0)
+++ trunk/liblwgeom/kmeans.h	2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,126 @@
+/*-------------------------------------------------------------------------
+*
+* kmeans.h
+*    Generic k-means implementation
+*
+* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
+*
+*------------------------------------------------------------------------*/
+
+
+#include <stdlib.h>
+#include "liblwgeom_internal.h"
+
+/*
+* Simple k-means implementation for arbitrary data structures
+*
+* Since k-means partitions based on inter-object "distance" the same
+* machinery can be used to support any object type that can calculate a
+* "distance" between pairs.
+*
+* To use the k-means infrastructure, just fill out the kmeans_config
+* structure and invoke the kmeans() function.
+*/
+
+/*
+* Threaded calculation is available using pthreads, which practically
+* means UNIX platforms only, unless you're building with a posix
+* compatible environment.
+*
+* #define KMEANS_THREADED
+*/
+
+/*
+* When clustering lists with NULL elements, they will get this as
+* their cluster number. (All the other clusters will be non-negative)
+*/
+#define KMEANS_NULL_CLUSTER -1
+
+/*
+* If the algorithm doesn't converge within this number of iterations,
+* it will return with a failure error code.
+*/
+#define KMEANS_MAX_ITERATIONS 1000
+
+/*
+* The code doesn't try to figure out how many threads to use, so
+* best to set this to the number of cores you expect to have
+* available. The threshold is the the value of k*n at which to
+* move to multi-threading.
+*/
+#ifdef KMEANS_THREADED
+#define KMEANS_THR_MAX 4
+#define KMEANS_THR_THRESHOLD 250000
+#endif
+
+#define kmeans_malloc(size) lwalloc(size)
+#define kmeans_free(ptr) lwfree(ptr)
+
+typedef void * Pointer;
+
+typedef enum {
+	KMEANS_OK,
+	KMEANS_EXCEEDED_MAX_ITERATIONS,
+	KMEANS_ERROR
+} kmeans_result;
+
+/*
+* Prototype for the distance calculating function
+*/
+typedef double (*kmeans_distance_method) (const Pointer a, const Pointer b);
+
+/*
+* Prototype for the centroid calculating function
+* @param objs the list of all objects in the calculation
+* @param clusters the list of cluster numbers for each object
+* @param num_objs the number of objects/cluster numbers in the previous arrays
+* @param cluster the cluster number we are actually generating a centroid for here
+* @param centroid the object to write the centroid result into (already allocated)
+*/
+typedef void (*kmeans_centroid_method) (const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid);
+
+typedef struct kmeans_config
+{
+	/* Function returns the "distance" between any pair of objects */
+	kmeans_distance_method distance_method;
+
+	/* Function returns the "centroid" of a collection of objects */
+	kmeans_centroid_method centroid_method;
+
+	/* An array of objects to be analyzed. User allocates this array */
+	/* and is responsible for freeing it. */
+	/* For objects that are not capable of participating in the distance */
+	/* calculations, but for which you still want included in the process */
+	/* (for examples, database nulls, or geometry empties) use a NULL */
+	/* value in this list. All NULL values will be returned in the */
+	/* KMEANS_NULL_CLUSTER. */
+	Pointer * objs;
+
+	/* Number of objects in the preceding array */
+	size_t num_objs;
+
+	/* An array of inital centers for the algorithm */
+	/* Can be randomly assigned, or using proportions, */
+	/* unfortunately the algorithm is sensitive to starting */
+	/* points, so using a "better" set of starting points */
+	/* might be wise. User allocates and is responsible for freeing. */
+	Pointer * centers;
+
+	/* Number of means we are calculating, length of preceding array */
+	unsigned int k;
+
+	/* Maximum number of times to iterate the algorithm, or 0 for */
+	/* library default */
+	unsigned int max_iterations;
+
+	/* Iteration counter */
+	unsigned int total_iterations;
+
+	/* Array to fill in with cluster numbers. User allocates and frees. */
+	int * clusters;
+
+} kmeans_config;
+
+/* This is where the magic happens. */
+kmeans_result kmeans(kmeans_config *config);
+

Modified: trunk/liblwgeom/liblwgeom.h.in
===================================================================
--- trunk/liblwgeom/liblwgeom.h.in	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/liblwgeom.h.in	2016-02-24 10:44:11 UTC (rev 14673)
@@ -2143,6 +2143,7 @@
 LWGEOM *lwgeom_intersection(const LWGEOM *geom1, const LWGEOM *geom2);
 LWGEOM *lwgeom_difference(const LWGEOM *geom1, const LWGEOM *geom2);
 LWGEOM *lwgeom_symdifference(const LWGEOM* geom1, const LWGEOM* geom2);
+LWGEOM *lwgeom_centroid(const LWGEOM* geom);
 LWGEOM *lwgeom_union(const LWGEOM *geom1, const LWGEOM *geom2);
 LWGEOM *lwgeom_linemerge(const LWGEOM *geom1);
 LWGEOM *lwgeom_unaryunion(const LWGEOM *geom1);
@@ -2279,5 +2280,16 @@
  */
 LWGEOM* lwgeom_voronoi_diagram(const LWGEOM* g, const GBOX* env, double tolerance, int output_edges);
 
+/**
+* Take a list of LWGEOMs and a number of clusters and return an integer
+* array indicating which cluster each geometry is in.
+*
+* @param geoms the input array of LWGEOM pointers
+* @param ngeoms the number of elements in the array
+* @param k the number of clusters to calculate
+*/
+int * lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, int ngeoms, int k);
+
+
 #endif /* !defined _LIBLWGEOM_H  */
 

Modified: trunk/liblwgeom/lwgeom_geos.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos.c	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/lwgeom_geos.c	2016-02-24 10:44:11 UTC (rev 14673)
@@ -866,6 +866,60 @@
 	return result;
 }
 
+LWGEOM *
+lwgeom_centroid(const LWGEOM* geom)
+{
+	GEOSGeometry *g, *g_centroid;
+	LWGEOM *centroid;
+	int srid, is3d;
+
+	if (lwgeom_is_empty(geom))
+	{
+		LWPOINT *lwp = lwpoint_construct_empty(
+		                   lwgeom_get_srid(geom),
+		                   lwgeom_has_z(geom),
+		                   lwgeom_has_m(geom));
+		return lwpoint_as_lwgeom(lwp);
+	}
+
+	srid = lwgeom_get_srid(geom);
+	is3d = lwgeom_has_z(geom);
+
+	initGEOS(lwnotice, lwgeom_geos_error);
+
+	g = LWGEOM2GEOS(geom, 0);
+
+	if (0 == g)   /* exception thrown at construction */
+	{
+		lwerror("Geometry could not be converted to GEOS: %s", lwgeom_geos_errmsg);
+		return NULL;
+	}
+
+	g_centroid = GEOSGetCentroid(g);
+	GEOSGeom_destroy(g);
+
+	if (g_centroid == NULL)
+	{
+		lwerror("GEOSGetCentroid: %s", lwgeom_geos_errmsg);
+		return NULL; /*never get here */
+	}
+
+	LWDEBUGF(3, "result: %s", GEOSGeomToWKT(g_centroid));
+
+	GEOSSetSRID(g_centroid, srid);
+
+	centroid = GEOS2LWGEOM(g_centroid, is3d);
+	GEOSGeom_destroy(g_centroid);
+
+	if (centroid == NULL)
+	{
+		lwerror("GEOS GEOSGetCentroid() threw an error (result postgis geometry formation)!");
+		return NULL ; /*never get here */
+	}
+
+	return centroid;
+}
+
 LWGEOM*
 lwgeom_union(const LWGEOM *geom1, const LWGEOM *geom2)
 {

Added: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c	                        (rev 0)
+++ trunk/liblwgeom/lwkmeans.c	2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,231 @@
+#include <float.h>
+#include <math.h>
+
+#include "kmeans.h"
+#include "liblwgeom_internal.h"
+
+
+static double lwkmeans_pt_distance(const Pointer a, const Pointer b)
+{
+	POINT2D *pa = (POINT2D*)a;
+	POINT2D *pb = (POINT2D*)b;
+
+	double dx = (pa->x - pb->x);
+	double dy = (pa->y - pb->y);
+
+	return dx*dx + dy*dy;
+}
+
+static int lwkmeans_pt_closest(const Pointer * objs, size_t num_objs, const Pointer a)
+{
+	int i;
+	double d, d_closest;
+	double closest;
+
+	assert(num_objs>0);
+
+	d_closest = lwkmeans_pt_distance(objs[0], a);
+	closest = 0;
+
+	for (i = 1; i < num_objs; i++)
+	{
+		d = lwkmeans_pt_distance(objs[i], a);
+		if (d < d_closest)
+		{
+			d_closest = d;
+			closest = i;
+		}
+	}
+
+	return closest;
+}
+
+static void lwkmeans_pt_centroid(const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid)
+{
+	int i;
+	int num_cluster = 0;
+	POINT2D sum;
+	POINT2D **pts = (POINT2D**)objs;
+	POINT2D *center = (POINT2D*)centroid;
+
+	sum.x = sum.y = 0.0;
+
+	if (num_objs <= 0) return;
+
+	for (i = 0; i < num_objs; i++)
+	{
+		/* Skip points that are not of interest */
+		if (clusters[i] != cluster) continue;
+
+		sum.x += pts[i]->x;
+		sum.y += pts[i]->y;
+		num_cluster++;
+	}
+	if (num_cluster)
+	{
+		sum.x /= num_cluster;
+		sum.y /= num_cluster;
+		*center = sum;
+	}
+	return;
+}
+
+
+int *
+lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, int ngeoms, int k)
+{
+	int i;
+	int num_centroids = 0;
+	LWGEOM **centroids;
+	POINT2D *centers_raw;
+	const POINT2D *cp;
+	POINT2D min, max;
+	double dx, dy;
+	kmeans_config config;
+	kmeans_result result;
+	int *seen;
+	int sidx = 0;
+
+	assert(k>0);
+	assert(ngeoms>0);
+	assert(geoms);
+
+	/* We'll hold the temporary centroid objects here */
+	centroids = lwalloc(sizeof(LWGEOM*) * ngeoms);
+
+	/* The vector of cluster means. We have to allocate a */
+	/* chunk of memory for these because we'll be mutating them */
+	/* in the kmeans algorithm */
+	centers_raw = lwalloc(sizeof(POINT2D) * k);
+
+	/* K-means configuration setup */
+	config.objs = lwalloc(sizeof(Pointer) * ngeoms);
+	config.num_objs = ngeoms;
+	config.clusters = lwalloc(sizeof(int) * ngeoms);
+	config.centers = lwalloc(sizeof(Pointer) * k);
+	config.k = k;
+	config.max_iterations = 0;
+	config.distance_method = lwkmeans_pt_distance;
+	config.centroid_method = lwkmeans_pt_centroid;
+
+	/* Clean the memory */
+	memset(config.objs, 0, sizeof(Pointer) * ngeoms);
+	memset(config.clusters, 0, sizeof(int) * ngeoms);
+	memset(config.centers, 0, sizeof(Pointer) * k);
+
+	/* Prepare the list of object pointers for K-means */
+	for (i = 0; i < ngeoms; i++)
+	{
+		const LWGEOM *geom = geoms[i];
+		LWPOINT *lwpoint;
+
+		/* Null/empty geometries get a NULL pointer */
+		if ((!geom) || lwgeom_is_empty(geom))
+		{
+			config.objs[i] = NULL;
+			continue;
+		}
+
+		/* If the input is a point, use its coordinates */
+		/* If its not a point, convert it to one via centroid */
+		if (lwgeom_get_type(geom) != POINTTYPE)
+		{
+			LWGEOM *centroid = lwgeom_centroid(geom);
+			if ((!centroid) || lwgeom_is_empty(centroid))
+			{
+				config.objs[i] = NULL;
+				continue;
+			}
+			centroids[num_centroids++] = centroid;
+			lwpoint = lwgeom_as_lwpoint(centroid);
+		}
+		else
+		{
+			lwpoint = lwgeom_as_lwpoint(geom);
+		}
+
+		/* Store a pointer to the POINT2D we are interested in */
+		cp = getPoint2d_cp(lwpoint->point, 0);
+		config.objs[i] = (Pointer)cp;
+
+		/* Since we're already here, let's calculate the extrema of the set */
+		if (i == 0)
+		{
+			min = *cp;
+			max = *cp;
+		}
+		else
+		{
+			if (cp->x < min.x) min.x = cp->x;
+			if (cp->y < min.y) min.y = cp->y;
+			if (cp->x > max.x) max.x = cp->x;
+			if (cp->y > max.y) max.y = cp->y;
+		}
+	}
+
+	/*
+	* We map a uniform assignment of points in the area covered by the set
+	* onto actual points in the set
+	*/
+	dx = (max.x - min.x)/k;
+	dy = (max.y - min.y)/k;
+	seen = lwalloc(sizeof(int)*config.k);
+	for (i = 0; i < k; i++)
+	{
+		int closest;
+		POINT2D p;
+		int j;
+
+		/* Calculate a point in the range */
+		p.x = min.x + dx * (i + 0.5);
+		p.y = min.y + dy * (i + 0.5);
+
+		/* Find the data point closest to the calculated point */
+		closest = lwkmeans_pt_closest(config.objs, config.num_objs, &p);
+
+		/* Ensure we aren't already using that point as a seed */
+		j = 0;
+		while(j < sidx)
+		{
+			if (seen[j] == closest)
+			{
+				closest = (closest + 1) % config.num_objs;
+			}
+			else
+			{
+				j++;
+			}
+		}
+		seen[sidx++] = closest;
+
+		/* Copy the point coordinates into the initial centers array */
+		/* This is ugly, but the centers array is an array of */
+		/* pointers to points, not an array of points */
+		centers_raw[i] = *((POINT2D*)config.objs[closest]);
+		config.centers[i] = &(centers_raw[i]);
+	}
+
+	result = kmeans(&config);
+
+	/* Before error handling, might as well clean up all the inputs */
+	lwfree(config.objs);
+	lwfree(config.centers);
+	lwfree(centers_raw);
+	lwfree(centroids);
+
+	/* Good result */
+	if (result == KMEANS_OK)
+		return config.clusters;
+
+	/* Bad result, not going to need the answer */
+	lwfree(config.clusters);
+	if (result == KMEANS_EXCEEDED_MAX_ITERATIONS)
+	{
+		lwerror("%s did not converge after %d iterations", __func__, config.max_iterations);
+		return NULL;
+	}
+
+	/* Unknown error */
+	return NULL;
+}
+

Modified: trunk/liblwgeom/stringbuffer.c
===================================================================
--- trunk/liblwgeom/stringbuffer.c	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/stringbuffer.c	2016-02-24 10:44:11 UTC (rev 14673)
@@ -37,6 +37,27 @@
 	return stringbuffer_create_with_size(STRINGBUFFER_STARTSIZE);
 }
 
+static void
+stringbuffer_init_with_size(stringbuffer_t *s, size_t size)
+{
+	s->str_start = lwalloc(size);
+	s->str_end = s->str_start;
+	s->capacity = size;
+	memset(s->str_start, 0, size);
+}
+
+void
+stringbuffer_release(stringbuffer_t *s)
+{
+	if ( s->str_start ) lwfree(s->str_start);
+}
+
+void
+stringbuffer_init(stringbuffer_t *s)
+{
+	stringbuffer_init_with_size(s, STRINGBUFFER_STARTSIZE);
+}
+
 /**
 * Allocate a new stringbuffer_t. Use stringbuffer_destroy to free.
 */
@@ -46,10 +67,7 @@
 	stringbuffer_t *s;
 
 	s = lwalloc(sizeof(stringbuffer_t));
-	s->str_start = lwalloc(size);
-	s->str_end = s->str_start;
-	s->capacity = size;
-	memset(s->str_start,0,size);
+	stringbuffer_init_with_size(s, size);
 	return s;
 }
 
@@ -59,7 +77,7 @@
 void 
 stringbuffer_destroy(stringbuffer_t *s)
 {
-	if ( s->str_start ) lwfree(s->str_start);
+	stringbuffer_release(s);
 	if ( s ) lwfree(s);
 }
 

Modified: trunk/liblwgeom/stringbuffer.h
===================================================================
--- trunk/liblwgeom/stringbuffer.h	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/stringbuffer.h	2016-02-24 10:44:11 UTC (rev 14673)
@@ -44,6 +44,8 @@
 
 extern stringbuffer_t *stringbuffer_create_with_size(size_t size);
 extern stringbuffer_t *stringbuffer_create(void);
+extern void stringbuffer_init(stringbuffer_t *s);
+extern void stringbuffer_release(stringbuffer_t *s);
 extern void stringbuffer_destroy(stringbuffer_t *sb);
 extern void stringbuffer_clear(stringbuffer_t *sb);
 void stringbuffer_set(stringbuffer_t *sb, const char *s);

Modified: trunk/postgis/Makefile.in
===================================================================
--- trunk/postgis/Makefile.in	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/postgis/Makefile.in	2016-02-24 10:44:11 UTC (rev 14673)
@@ -83,6 +83,7 @@
 	lwgeom_sqlmm.o \
 	lwgeom_rtree.o \
 	lwgeom_transform.o \
+	lwgeom_window.o \
 	gserialized_typmod.o \
 	gserialized_gist_2d.o \
 	gserialized_gist_nd.o \

Added: trunk/postgis/lwgeom_window.c
===================================================================
--- trunk/postgis/lwgeom_window.c	                        (rev 0)
+++ trunk/postgis/lwgeom_window.c	2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,132 @@
+/**********************************************************************
+ *
+ * PostGIS - Spatial Types for PostgreSQL
+ * http://postgis.net
+ *
+ * PostGIS is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * PostGIS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PostGIS.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************
+ *
+ * Copyright 2016 Paul Ramsey <pramsey at cleverelephant.ca>
+ *
+ **********************************************************************/
+
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/builtins.h"
+#include "windowapi.h"
+
+#include "../postgis_config.h"
+
+#include "liblwgeom.h"
+#include "lwgeom_pg.h"
+
+extern Datum ST_KMeans(PG_FUNCTION_ARGS);
+
+typedef struct{
+	bool	isdone;
+	bool	isnull;
+	int		result[1];
+	/* variable length */
+} kmeans_context;
+
+PG_FUNCTION_INFO_V1(ST_KMeans);
+Datum ST_KMeans(PG_FUNCTION_ARGS)
+{
+	WindowObject winobj = PG_WINDOW_OBJECT();
+	kmeans_context *context;
+	int64 curpos, rowcount;
+
+	rowcount = WinGetPartitionRowCount(winobj);
+	context = (kmeans_context *)
+		WinGetPartitionLocalMemory(winobj,
+			sizeof(kmeans_context) + sizeof(int) * rowcount);
+
+	if (!context->isdone)
+	{
+		int       i, k, N;
+		bool      isnull, isout;
+		LWGEOM    **geoms;
+		int       *r;
+
+		/* What is K? If it's NULL or invalid, we can't procede */
+		k = DatumGetInt32(WinGetFuncArgCurrent(winobj, 1, &isnull));
+		if (isnull || k <= 0)
+		{
+			context->isdone = true;
+			context->isnull = true;
+			PG_RETURN_NULL();
+		}
+
+		/* We also need a non-zero N */
+		N = (int) WinGetPartitionRowCount(winobj);
+		if (N <= 0)
+		{
+			context->isdone = true;
+			context->isnull = true;
+			PG_RETURN_NULL();
+		}
+
+		/* Read all the geometries from the partition window into a list */
+		geoms = palloc(sizeof(LWGEOM*) * N);
+		for (i = 0; i < N; i++)
+		{
+			GSERIALIZED *g;
+			Datum arg = WinGetFuncArgInPartition(winobj, 0, i,
+						WINDOW_SEEK_HEAD, false, &isnull, &isout);
+
+			/* Null geometries are entered as NULL pointers */
+			if (isnull)
+			{
+				geoms[i] = NULL;
+				continue;
+			}
+
+			g = (GSERIALIZED*)PG_DETOAST_DATUM_COPY(arg);
+			geoms[i] = lwgeom_from_gserialized(g);
+		}
+
+		/* Calculate k-means on the list! */
+		r = lwgeom_cluster_2d_kmeans((const LWGEOM **)geoms, N, k);
+
+		/* Clean up */
+		for (i = 0; i < N; i++)
+			if (geoms[i])
+				lwgeom_free(geoms[i]);
+
+		pfree(geoms);
+
+		if (!r)
+		{
+			context->isdone = true;
+			context->isnull = true;
+			PG_RETURN_NULL();
+		}
+
+		/* Safe the result */
+		memcpy(context->result, r, sizeof(int) * N);
+		pfree(r);
+		context->isdone = true;
+	}
+
+	if (context->isnull)
+		PG_RETURN_NULL();
+
+	curpos = WinGetCurrentPosition(winobj);
+	PG_RETURN_INT32(context->result[curpos]);
+}

Modified: trunk/postgis/postgis.sql.in
===================================================================
--- trunk/postgis/postgis.sql.in	2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/postgis/postgis.sql.in	2016-02-24 10:44:11 UTC (rev 14673)
@@ -3782,9 +3782,15 @@
 	);
 
 
-
 --------------------------------------------------------------------------------
 
+-- Availability: 2.3.0
+CREATE FUNCTION ST_KMeans(geom geometry, k integer)
+  RETURNS integer
+  AS 'MODULE_PATHNAME', 'ST_KMeans'
+  LANGUAGE 'c' VOLATILE STRICT WINDOW;
+
+
 -- Availability: 1.2.2
 CREATE OR REPLACE FUNCTION ST_Relate(geom1 geometry, geom2 geometry)
 	RETURNS text



More information about the postgis-tickets mailing list