[postgis-tickets] r14673 - #3465, ST_Kmeans
Paul Ramsey
pramsey at cleverelephant.ca
Wed Feb 24 02:44:12 PST 2016
Author: pramsey
Date: 2016-02-24 02:44:11 -0800 (Wed, 24 Feb 2016)
New Revision: 14673
Added:
trunk/liblwgeom/kmeans.c
trunk/liblwgeom/kmeans.h
trunk/liblwgeom/lwkmeans.c
trunk/postgis/lwgeom_window.c
Modified:
trunk/NEWS
trunk/liblwgeom/Makefile.in
trunk/liblwgeom/cunit/cu_algorithm.c
trunk/liblwgeom/liblwgeom.h.in
trunk/liblwgeom/lwgeom_geos.c
trunk/liblwgeom/stringbuffer.c
trunk/liblwgeom/stringbuffer.h
trunk/postgis/Makefile.in
trunk/postgis/postgis.sql.in
Log:
#3465, ST_Kmeans
Modified: trunk/NEWS
===================================================================
--- trunk/NEWS 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/NEWS 2016-02-24 10:44:11 UTC (rev 14673)
@@ -12,6 +12,7 @@
- #2259 ST_Voronoi (Dan Baston)
- #3339 ST_GeneratePoints (Paul Ramsey)
- #3428 ST_Points (Dan Baston)
+ - #3465 ST_KMeans (Paul Ramsey)
PostGIS 2.2.1
2016/01/06
Modified: trunk/liblwgeom/Makefile.in
===================================================================
--- trunk/liblwgeom/Makefile.in 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/Makefile.in 2016-02-24 10:44:11 UTC (rev 14673)
@@ -113,6 +113,8 @@
lwgeom_transform.o \
lwunionfind.o \
effectivearea.o \
+ lwkmeans.o \
+ kmeans.o \
varint.o
NM_OBJS = \
Modified: trunk/liblwgeom/cunit/cu_algorithm.c
===================================================================
--- trunk/liblwgeom/cunit/cu_algorithm.c 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/cunit/cu_algorithm.c 2016-02-24 10:44:11 UTC (rev 14673)
@@ -1023,7 +1023,47 @@
lwgeom_free(geom);
}
+static void test_kmeans(void)
+{
+ static int cluster_size = 100;
+ static int num_clusters = 4;
+ static double spread = 1.5;
+ int N = cluster_size * num_clusters;
+ LWGEOM **geoms;
+ int i, j, k=0;
+ int *r;
+ geoms = lwalloc(sizeof(LWGEOM*) * N);
+
+ for (j = 0; j < num_clusters; j++) {
+ for (i = 0; i < cluster_size; i++)
+ {
+ double u1 = 1.0 * random() / RAND_MAX;
+ double u2 = 1.0 * random() / RAND_MAX;
+ double z1 = spread * j + sqrt(-2*log2(u1))*cos(2*M_PI*u2);
+ double z2 = spread * j + sqrt(-2*log2(u1))*sin(2*M_PI*u2);
+
+ LWPOINT *lwp = lwpoint_make2d(SRID_UNKNOWN, z1, z2);
+ geoms[k++] = lwpoint_as_lwgeom(lwp);
+ }
+ }
+
+ r = lwgeom_cluster_2d_kmeans((const LWGEOM **)geoms, N, num_clusters);
+
+ // for (i = 0; i < k; i++)
+ // {
+ // printf("[%d] %s\n", r[i], lwgeom_to_ewkt(geoms[i]));
+ // }
+
+ /* Clean up */
+ lwfree(r);
+ for (i = 0; i < k; i++)
+ lwgeom_free(geoms[i]);
+ lwfree(geoms);
+
+ return;
+}
+
/*
** Used by test harness to register the tests in this file.
*/
@@ -1050,4 +1090,5 @@
PG_ADD_TEST(suite,test_lwgeom_simplify);
PG_ADD_TEST(suite,test_lw_arc_center);
PG_ADD_TEST(suite,test_point_density);
+ PG_ADD_TEST(suite,test_kmeans);
}
Added: trunk/liblwgeom/kmeans.c
===================================================================
--- trunk/liblwgeom/kmeans.c (rev 0)
+++ trunk/liblwgeom/kmeans.c 2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,316 @@
+/*-------------------------------------------------------------------------
+*
+* kmeans.c
+* Generic k-means implementation
+*
+* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
+*
+*------------------------------------------------------------------------*/
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "kmeans.h"
+
+#ifdef KMEANS_THREADED
+#include <pthread.h>
+#endif
+
+static void
+update_r(kmeans_config *config)
+{
+ int i;
+
+ for (i = 0; i < config->num_objs; i++)
+ {
+ double distance, curr_distance;
+ int cluster, curr_cluster;
+ Pointer obj;
+
+ assert(config->objs != NULL);
+ assert(config->num_objs > 0);
+ assert(config->centers);
+ assert(config->clusters);
+
+ obj = config->objs[i];
+
+ /*
+ * Don't try to cluster NULL objects, just add them
+ * to the "unclusterable cluster"
+ */
+ if (!obj)
+ {
+ config->clusters[i] = KMEANS_NULL_CLUSTER;
+ continue;
+ }
+
+ /* Initialize with distance to first cluster */
+ curr_distance = (config->distance_method)(obj, config->centers[0]);
+ curr_cluster = 0;
+
+ /* Check all other cluster centers and find the nearest */
+ for (cluster = 1; cluster < config->k; cluster++)
+ {
+ distance = (config->distance_method)(obj, config->centers[cluster]);
+ if (distance < curr_distance)
+ {
+ curr_distance = distance;
+ curr_cluster = cluster;
+ }
+ }
+
+ /* Store the nearest cluster this object is in */
+ config->clusters[i] = curr_cluster;
+ }
+}
+
+static void
+update_means(kmeans_config *config)
+{
+ int i;
+
+ for (i = 0; i < config->k; i++)
+ {
+ /* Update the centroid for this cluster */
+ (config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
+ }
+}
+
+#ifdef KMEANS_THREADED
+
+static void * update_r_threaded_main(void *args)
+{
+ kmeans_config *config = (kmeans_config*)args;
+ update_r(config);
+ pthread_exit(args);
+}
+
+static void update_r_threaded(kmeans_config *config)
+{
+ /* Computational complexity is function of objs/clusters */
+ /* We only spin up threading infra if we need more than one core */
+ /* running. We keep the threshold high so the overhead of */
+ /* thread management is small compared to thread compute time */
+ int num_threads = config->num_objs * config->k / KMEANS_THR_THRESHOLD;
+
+ /* Can't run more threads than the maximum */
+ num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
+
+ /* If the problem size is small, don't bother w/ threading */
+ if (num_threads < 1)
+ {
+ update_r(config);
+ }
+ else
+ {
+ pthread_t thread[KMEANS_THR_MAX];
+ pthread_attr_t thread_attr;
+ kmeans_config thread_config[KMEANS_THR_MAX];
+ int obs_per_thread = config->num_objs / num_threads;
+ int i, rc;
+
+ for (i = 0; i < num_threads; i++)
+ {
+ /*
+ * Each thread gets a copy of the config, but with the list pointers
+ * offest to the start of the batch the thread is responsible for, and the
+ * object count number adjusted similarly.
+ */
+ memcpy(&(thread_config[i]), config, sizeof(kmeans_config));
+ thread_config[i].objs += i*obs_per_thread;
+ thread_config[i].clusters += i*obs_per_thread;
+ thread_config[i].num_objs = obs_per_thread;
+ if (i == num_threads-1)
+ {
+ thread_config[i].num_objs += config->num_objs - num_threads*obs_per_thread;
+ }
+
+ /* Initialize and set thread detached attribute */
+ pthread_attr_init(&thread_attr);
+ pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+
+ /* Now we just run the thread, on its subset of the data */
+ rc = pthread_create(&thread[i], &thread_attr, update_r_threaded_main, (void *) &thread_config[i]);
+ if (rc)
+ {
+ printf("ERROR: return code from pthread_create() is %d\n", rc);
+ exit(-1);
+ }
+ }
+
+ /* Free attribute and wait for the other threads */
+ pthread_attr_destroy(&thread_attr);
+
+ /* Wait for all calculations to complete */
+ for (i = 0; i < num_threads; i++)
+ {
+ void *status;
+ rc = pthread_join(thread[i], &status);
+ if (rc)
+ {
+ printf("ERROR: return code from pthread_join() is %d\n", rc);
+ exit(-1);
+ }
+ }
+ }
+}
+
+int update_means_k;
+pthread_mutex_t update_means_k_mutex;
+
+static void *
+update_means_threaded_main(void *arg)
+{
+ kmeans_config *config = (kmeans_config*)arg;
+ int i = 0;
+
+ do
+ {
+ pthread_mutex_lock (&update_means_k_mutex);
+ i = update_means_k;
+ update_means_k++;
+ pthread_mutex_unlock (&update_means_k_mutex);
+
+ if (i < config->k)
+ (config->centroid_method)(config->objs, config->clusters, config->num_objs, i, config->centers[i]);
+ }
+ while (i < config->k);
+
+ pthread_exit(arg);
+}
+
+static void
+update_means_threaded(kmeans_config *config)
+{
+ /* We only spin up threading infra if we need more than one core */
+ /* running. We keep the threshold high so the overhead of */
+ /* thread management is small compared to thread compute time */
+ int num_threads = config->num_objs / KMEANS_THR_THRESHOLD;
+
+ /* Can't run more threads than the maximum */
+ num_threads = (num_threads > KMEANS_THR_MAX ? KMEANS_THR_MAX : num_threads);
+
+ /* If the problem size is small, don't bother w/ threading */
+ if (num_threads < 1)
+ {
+ update_means(config);
+ }
+ else
+ {
+ /* Mutex protected counter to drive threads */
+ pthread_t thread[KMEANS_THR_MAX];
+ pthread_attr_t thread_attr;
+ int i, rc;
+
+ pthread_mutex_init(&update_means_k_mutex, NULL);
+ update_means_k = 0;
+
+ pthread_attr_init(&thread_attr);
+ pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+
+ /* Create threads to perform computation */
+ for (i = 0; i < num_threads; i++)
+ {
+
+ /* Now we just run the thread, on its subset of the data */
+ rc = pthread_create(&thread[i], &thread_attr, update_means_threaded_main, (void *) config);
+ if (rc)
+ {
+ printf("ERROR: return code from pthread_create() is %d\n", rc);
+ exit(-1);
+ }
+ }
+
+ pthread_attr_destroy(&thread_attr);
+
+ /* Watch until completion */
+ for (i = 0; i < num_threads; i++)
+ {
+ void *status;
+ rc = pthread_join(thread[i], &status);
+ if (rc)
+ {
+ printf("ERROR: return code from pthread_join() is %d\n", rc);
+ exit(-1);
+ }
+ }
+
+ pthread_mutex_destroy(&update_means_k_mutex);
+ }
+}
+
+#endif /* KMEANS_THREADED */
+
+kmeans_result
+kmeans(kmeans_config *config)
+{
+ int iterations = 0;
+ int *clusters_last;
+ size_t clusters_sz = sizeof(int)*config->num_objs;
+
+ assert(config);
+ assert(config->objs);
+ assert(config->num_objs);
+ assert(config->distance_method);
+ assert(config->centroid_method);
+ assert(config->centers);
+ assert(config->k);
+ assert(config->clusters);
+
+ /* Zero out cluster numbers, just in case user forgets */
+ memset(config->clusters, 0, clusters_sz);
+
+ /* Set default max iterations if necessary */
+ if (!config->max_iterations)
+ config->max_iterations = KMEANS_MAX_ITERATIONS;
+
+ /*
+ * Previous cluster state array. At this time, r doesn't mean anything
+ * but it's ok
+ */
+ clusters_last = kmeans_malloc(clusters_sz);
+
+ while (1)
+ {
+ LW_ON_INTERRUPT(kmeans_free(clusters_last); return KMEANS_ERROR);
+
+ /* Store the previous state of the clustering */
+ memcpy(clusters_last, config->clusters, clusters_sz);
+
+#ifdef KMEANS_THREADED
+ update_r_threaded(config);
+ update_means_threaded(config);
+#else
+ update_r(config);
+ update_means(config);
+#endif
+ /*
+ * if all the cluster numbers are unchanged since last time,
+ * we are at a stable solution, so we can stop here
+ */
+ if (memcmp(clusters_last, config->clusters, clusters_sz) == 0)
+ {
+ kmeans_free(clusters_last);
+ config->total_iterations = iterations;
+ return KMEANS_OK;
+ }
+
+ if (iterations++ > config->max_iterations)
+ {
+ kmeans_free(clusters_last);
+ config->total_iterations = iterations;
+ return KMEANS_EXCEEDED_MAX_ITERATIONS;
+ }
+
+ }
+
+ kmeans_free(clusters_last);
+ config->total_iterations = iterations;
+ return KMEANS_ERROR;
+}
+
+
Added: trunk/liblwgeom/kmeans.h
===================================================================
--- trunk/liblwgeom/kmeans.h (rev 0)
+++ trunk/liblwgeom/kmeans.h 2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,126 @@
+/*-------------------------------------------------------------------------
+*
+* kmeans.h
+* Generic k-means implementation
+*
+* Copyright (c) 2016, Paul Ramsey <pramsey at cleverelephant.ca>
+*
+*------------------------------------------------------------------------*/
+
+
+#include <stdlib.h>
+#include "liblwgeom_internal.h"
+
+/*
+* Simple k-means implementation for arbitrary data structures
+*
+* Since k-means partitions based on inter-object "distance" the same
+* machinery can be used to support any object type that can calculate a
+* "distance" between pairs.
+*
+* To use the k-means infrastructure, just fill out the kmeans_config
+* structure and invoke the kmeans() function.
+*/
+
+/*
+* Threaded calculation is available using pthreads, which practically
+* means UNIX platforms only, unless you're building with a posix
+* compatible environment.
+*
+* #define KMEANS_THREADED
+*/
+
+/*
+* When clustering lists with NULL elements, they will get this as
+* their cluster number. (All the other clusters will be non-negative)
+*/
+#define KMEANS_NULL_CLUSTER -1
+
+/*
+* If the algorithm doesn't converge within this number of iterations,
+* it will return with a failure error code.
+*/
+#define KMEANS_MAX_ITERATIONS 1000
+
+/*
+* The code doesn't try to figure out how many threads to use, so
+* best to set this to the number of cores you expect to have
+* available. The threshold is the the value of k*n at which to
+* move to multi-threading.
+*/
+#ifdef KMEANS_THREADED
+#define KMEANS_THR_MAX 4
+#define KMEANS_THR_THRESHOLD 250000
+#endif
+
+#define kmeans_malloc(size) lwalloc(size)
+#define kmeans_free(ptr) lwfree(ptr)
+
+typedef void * Pointer;
+
+typedef enum {
+ KMEANS_OK,
+ KMEANS_EXCEEDED_MAX_ITERATIONS,
+ KMEANS_ERROR
+} kmeans_result;
+
+/*
+* Prototype for the distance calculating function
+*/
+typedef double (*kmeans_distance_method) (const Pointer a, const Pointer b);
+
+/*
+* Prototype for the centroid calculating function
+* @param objs the list of all objects in the calculation
+* @param clusters the list of cluster numbers for each object
+* @param num_objs the number of objects/cluster numbers in the previous arrays
+* @param cluster the cluster number we are actually generating a centroid for here
+* @param centroid the object to write the centroid result into (already allocated)
+*/
+typedef void (*kmeans_centroid_method) (const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid);
+
+typedef struct kmeans_config
+{
+ /* Function returns the "distance" between any pair of objects */
+ kmeans_distance_method distance_method;
+
+ /* Function returns the "centroid" of a collection of objects */
+ kmeans_centroid_method centroid_method;
+
+ /* An array of objects to be analyzed. User allocates this array */
+ /* and is responsible for freeing it. */
+ /* For objects that are not capable of participating in the distance */
+ /* calculations, but for which you still want included in the process */
+ /* (for examples, database nulls, or geometry empties) use a NULL */
+ /* value in this list. All NULL values will be returned in the */
+ /* KMEANS_NULL_CLUSTER. */
+ Pointer * objs;
+
+ /* Number of objects in the preceding array */
+ size_t num_objs;
+
+ /* An array of inital centers for the algorithm */
+ /* Can be randomly assigned, or using proportions, */
+ /* unfortunately the algorithm is sensitive to starting */
+ /* points, so using a "better" set of starting points */
+ /* might be wise. User allocates and is responsible for freeing. */
+ Pointer * centers;
+
+ /* Number of means we are calculating, length of preceding array */
+ unsigned int k;
+
+ /* Maximum number of times to iterate the algorithm, or 0 for */
+ /* library default */
+ unsigned int max_iterations;
+
+ /* Iteration counter */
+ unsigned int total_iterations;
+
+ /* Array to fill in with cluster numbers. User allocates and frees. */
+ int * clusters;
+
+} kmeans_config;
+
+/* This is where the magic happens. */
+kmeans_result kmeans(kmeans_config *config);
+
Modified: trunk/liblwgeom/liblwgeom.h.in
===================================================================
--- trunk/liblwgeom/liblwgeom.h.in 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/liblwgeom.h.in 2016-02-24 10:44:11 UTC (rev 14673)
@@ -2143,6 +2143,7 @@
LWGEOM *lwgeom_intersection(const LWGEOM *geom1, const LWGEOM *geom2);
LWGEOM *lwgeom_difference(const LWGEOM *geom1, const LWGEOM *geom2);
LWGEOM *lwgeom_symdifference(const LWGEOM* geom1, const LWGEOM* geom2);
+LWGEOM *lwgeom_centroid(const LWGEOM* geom);
LWGEOM *lwgeom_union(const LWGEOM *geom1, const LWGEOM *geom2);
LWGEOM *lwgeom_linemerge(const LWGEOM *geom1);
LWGEOM *lwgeom_unaryunion(const LWGEOM *geom1);
@@ -2279,5 +2280,16 @@
*/
LWGEOM* lwgeom_voronoi_diagram(const LWGEOM* g, const GBOX* env, double tolerance, int output_edges);
+/**
+* Take a list of LWGEOMs and a number of clusters and return an integer
+* array indicating which cluster each geometry is in.
+*
+* @param geoms the input array of LWGEOM pointers
+* @param ngeoms the number of elements in the array
+* @param k the number of clusters to calculate
+*/
+int * lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, int ngeoms, int k);
+
+
#endif /* !defined _LIBLWGEOM_H */
Modified: trunk/liblwgeom/lwgeom_geos.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos.c 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/lwgeom_geos.c 2016-02-24 10:44:11 UTC (rev 14673)
@@ -866,6 +866,60 @@
return result;
}
+LWGEOM *
+lwgeom_centroid(const LWGEOM* geom)
+{
+ GEOSGeometry *g, *g_centroid;
+ LWGEOM *centroid;
+ int srid, is3d;
+
+ if (lwgeom_is_empty(geom))
+ {
+ LWPOINT *lwp = lwpoint_construct_empty(
+ lwgeom_get_srid(geom),
+ lwgeom_has_z(geom),
+ lwgeom_has_m(geom));
+ return lwpoint_as_lwgeom(lwp);
+ }
+
+ srid = lwgeom_get_srid(geom);
+ is3d = lwgeom_has_z(geom);
+
+ initGEOS(lwnotice, lwgeom_geos_error);
+
+ g = LWGEOM2GEOS(geom, 0);
+
+ if (0 == g) /* exception thrown at construction */
+ {
+ lwerror("Geometry could not be converted to GEOS: %s", lwgeom_geos_errmsg);
+ return NULL;
+ }
+
+ g_centroid = GEOSGetCentroid(g);
+ GEOSGeom_destroy(g);
+
+ if (g_centroid == NULL)
+ {
+ lwerror("GEOSGetCentroid: %s", lwgeom_geos_errmsg);
+ return NULL; /*never get here */
+ }
+
+ LWDEBUGF(3, "result: %s", GEOSGeomToWKT(g_centroid));
+
+ GEOSSetSRID(g_centroid, srid);
+
+ centroid = GEOS2LWGEOM(g_centroid, is3d);
+ GEOSGeom_destroy(g_centroid);
+
+ if (centroid == NULL)
+ {
+ lwerror("GEOS GEOSGetCentroid() threw an error (result postgis geometry formation)!");
+ return NULL ; /*never get here */
+ }
+
+ return centroid;
+}
+
LWGEOM*
lwgeom_union(const LWGEOM *geom1, const LWGEOM *geom2)
{
Added: trunk/liblwgeom/lwkmeans.c
===================================================================
--- trunk/liblwgeom/lwkmeans.c (rev 0)
+++ trunk/liblwgeom/lwkmeans.c 2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,231 @@
+#include <float.h>
+#include <math.h>
+
+#include "kmeans.h"
+#include "liblwgeom_internal.h"
+
+
+static double lwkmeans_pt_distance(const Pointer a, const Pointer b)
+{
+ POINT2D *pa = (POINT2D*)a;
+ POINT2D *pb = (POINT2D*)b;
+
+ double dx = (pa->x - pb->x);
+ double dy = (pa->y - pb->y);
+
+ return dx*dx + dy*dy;
+}
+
+static int lwkmeans_pt_closest(const Pointer * objs, size_t num_objs, const Pointer a)
+{
+ int i;
+ double d, d_closest;
+ double closest;
+
+ assert(num_objs>0);
+
+ d_closest = lwkmeans_pt_distance(objs[0], a);
+ closest = 0;
+
+ for (i = 1; i < num_objs; i++)
+ {
+ d = lwkmeans_pt_distance(objs[i], a);
+ if (d < d_closest)
+ {
+ d_closest = d;
+ closest = i;
+ }
+ }
+
+ return closest;
+}
+
+static void lwkmeans_pt_centroid(const Pointer * objs, const int * clusters, size_t num_objs, int cluster, Pointer centroid)
+{
+ int i;
+ int num_cluster = 0;
+ POINT2D sum;
+ POINT2D **pts = (POINT2D**)objs;
+ POINT2D *center = (POINT2D*)centroid;
+
+ sum.x = sum.y = 0.0;
+
+ if (num_objs <= 0) return;
+
+ for (i = 0; i < num_objs; i++)
+ {
+ /* Skip points that are not of interest */
+ if (clusters[i] != cluster) continue;
+
+ sum.x += pts[i]->x;
+ sum.y += pts[i]->y;
+ num_cluster++;
+ }
+ if (num_cluster)
+ {
+ sum.x /= num_cluster;
+ sum.y /= num_cluster;
+ *center = sum;
+ }
+ return;
+}
+
+
+int *
+lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, int ngeoms, int k)
+{
+ int i;
+ int num_centroids = 0;
+ LWGEOM **centroids;
+ POINT2D *centers_raw;
+ const POINT2D *cp;
+ POINT2D min, max;
+ double dx, dy;
+ kmeans_config config;
+ kmeans_result result;
+ int *seen;
+ int sidx = 0;
+
+ assert(k>0);
+ assert(ngeoms>0);
+ assert(geoms);
+
+ /* We'll hold the temporary centroid objects here */
+ centroids = lwalloc(sizeof(LWGEOM*) * ngeoms);
+
+ /* The vector of cluster means. We have to allocate a */
+ /* chunk of memory for these because we'll be mutating them */
+ /* in the kmeans algorithm */
+ centers_raw = lwalloc(sizeof(POINT2D) * k);
+
+ /* K-means configuration setup */
+ config.objs = lwalloc(sizeof(Pointer) * ngeoms);
+ config.num_objs = ngeoms;
+ config.clusters = lwalloc(sizeof(int) * ngeoms);
+ config.centers = lwalloc(sizeof(Pointer) * k);
+ config.k = k;
+ config.max_iterations = 0;
+ config.distance_method = lwkmeans_pt_distance;
+ config.centroid_method = lwkmeans_pt_centroid;
+
+ /* Clean the memory */
+ memset(config.objs, 0, sizeof(Pointer) * ngeoms);
+ memset(config.clusters, 0, sizeof(int) * ngeoms);
+ memset(config.centers, 0, sizeof(Pointer) * k);
+
+ /* Prepare the list of object pointers for K-means */
+ for (i = 0; i < ngeoms; i++)
+ {
+ const LWGEOM *geom = geoms[i];
+ LWPOINT *lwpoint;
+
+ /* Null/empty geometries get a NULL pointer */
+ if ((!geom) || lwgeom_is_empty(geom))
+ {
+ config.objs[i] = NULL;
+ continue;
+ }
+
+ /* If the input is a point, use its coordinates */
+ /* If its not a point, convert it to one via centroid */
+ if (lwgeom_get_type(geom) != POINTTYPE)
+ {
+ LWGEOM *centroid = lwgeom_centroid(geom);
+ if ((!centroid) || lwgeom_is_empty(centroid))
+ {
+ config.objs[i] = NULL;
+ continue;
+ }
+ centroids[num_centroids++] = centroid;
+ lwpoint = lwgeom_as_lwpoint(centroid);
+ }
+ else
+ {
+ lwpoint = lwgeom_as_lwpoint(geom);
+ }
+
+ /* Store a pointer to the POINT2D we are interested in */
+ cp = getPoint2d_cp(lwpoint->point, 0);
+ config.objs[i] = (Pointer)cp;
+
+ /* Since we're already here, let's calculate the extrema of the set */
+ if (i == 0)
+ {
+ min = *cp;
+ max = *cp;
+ }
+ else
+ {
+ if (cp->x < min.x) min.x = cp->x;
+ if (cp->y < min.y) min.y = cp->y;
+ if (cp->x > max.x) max.x = cp->x;
+ if (cp->y > max.y) max.y = cp->y;
+ }
+ }
+
+ /*
+ * We map a uniform assignment of points in the area covered by the set
+ * onto actual points in the set
+ */
+ dx = (max.x - min.x)/k;
+ dy = (max.y - min.y)/k;
+ seen = lwalloc(sizeof(int)*config.k);
+ for (i = 0; i < k; i++)
+ {
+ int closest;
+ POINT2D p;
+ int j;
+
+ /* Calculate a point in the range */
+ p.x = min.x + dx * (i + 0.5);
+ p.y = min.y + dy * (i + 0.5);
+
+ /* Find the data point closest to the calculated point */
+ closest = lwkmeans_pt_closest(config.objs, config.num_objs, &p);
+
+ /* Ensure we aren't already using that point as a seed */
+ j = 0;
+ while(j < sidx)
+ {
+ if (seen[j] == closest)
+ {
+ closest = (closest + 1) % config.num_objs;
+ }
+ else
+ {
+ j++;
+ }
+ }
+ seen[sidx++] = closest;
+
+ /* Copy the point coordinates into the initial centers array */
+ /* This is ugly, but the centers array is an array of */
+ /* pointers to points, not an array of points */
+ centers_raw[i] = *((POINT2D*)config.objs[closest]);
+ config.centers[i] = &(centers_raw[i]);
+ }
+
+ result = kmeans(&config);
+
+ /* Before error handling, might as well clean up all the inputs */
+ lwfree(config.objs);
+ lwfree(config.centers);
+ lwfree(centers_raw);
+ lwfree(centroids);
+
+ /* Good result */
+ if (result == KMEANS_OK)
+ return config.clusters;
+
+ /* Bad result, not going to need the answer */
+ lwfree(config.clusters);
+ if (result == KMEANS_EXCEEDED_MAX_ITERATIONS)
+ {
+ lwerror("%s did not converge after %d iterations", __func__, config.max_iterations);
+ return NULL;
+ }
+
+ /* Unknown error */
+ return NULL;
+}
+
Modified: trunk/liblwgeom/stringbuffer.c
===================================================================
--- trunk/liblwgeom/stringbuffer.c 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/stringbuffer.c 2016-02-24 10:44:11 UTC (rev 14673)
@@ -37,6 +37,27 @@
return stringbuffer_create_with_size(STRINGBUFFER_STARTSIZE);
}
+static void
+stringbuffer_init_with_size(stringbuffer_t *s, size_t size)
+{
+ s->str_start = lwalloc(size);
+ s->str_end = s->str_start;
+ s->capacity = size;
+ memset(s->str_start, 0, size);
+}
+
+void
+stringbuffer_release(stringbuffer_t *s)
+{
+ if ( s->str_start ) lwfree(s->str_start);
+}
+
+void
+stringbuffer_init(stringbuffer_t *s)
+{
+ stringbuffer_init_with_size(s, STRINGBUFFER_STARTSIZE);
+}
+
/**
* Allocate a new stringbuffer_t. Use stringbuffer_destroy to free.
*/
@@ -46,10 +67,7 @@
stringbuffer_t *s;
s = lwalloc(sizeof(stringbuffer_t));
- s->str_start = lwalloc(size);
- s->str_end = s->str_start;
- s->capacity = size;
- memset(s->str_start,0,size);
+ stringbuffer_init_with_size(s, size);
return s;
}
@@ -59,7 +77,7 @@
void
stringbuffer_destroy(stringbuffer_t *s)
{
- if ( s->str_start ) lwfree(s->str_start);
+ stringbuffer_release(s);
if ( s ) lwfree(s);
}
Modified: trunk/liblwgeom/stringbuffer.h
===================================================================
--- trunk/liblwgeom/stringbuffer.h 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/liblwgeom/stringbuffer.h 2016-02-24 10:44:11 UTC (rev 14673)
@@ -44,6 +44,8 @@
extern stringbuffer_t *stringbuffer_create_with_size(size_t size);
extern stringbuffer_t *stringbuffer_create(void);
+extern void stringbuffer_init(stringbuffer_t *s);
+extern void stringbuffer_release(stringbuffer_t *s);
extern void stringbuffer_destroy(stringbuffer_t *sb);
extern void stringbuffer_clear(stringbuffer_t *sb);
void stringbuffer_set(stringbuffer_t *sb, const char *s);
Modified: trunk/postgis/Makefile.in
===================================================================
--- trunk/postgis/Makefile.in 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/postgis/Makefile.in 2016-02-24 10:44:11 UTC (rev 14673)
@@ -83,6 +83,7 @@
lwgeom_sqlmm.o \
lwgeom_rtree.o \
lwgeom_transform.o \
+ lwgeom_window.o \
gserialized_typmod.o \
gserialized_gist_2d.o \
gserialized_gist_nd.o \
Added: trunk/postgis/lwgeom_window.c
===================================================================
--- trunk/postgis/lwgeom_window.c (rev 0)
+++ trunk/postgis/lwgeom_window.c 2016-02-24 10:44:11 UTC (rev 14673)
@@ -0,0 +1,132 @@
+/**********************************************************************
+ *
+ * PostGIS - Spatial Types for PostgreSQL
+ * http://postgis.net
+ *
+ * PostGIS is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * PostGIS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with PostGIS. If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************
+ *
+ * Copyright 2016 Paul Ramsey <pramsey at cleverelephant.ca>
+ *
+ **********************************************************************/
+
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/builtins.h"
+#include "windowapi.h"
+
+#include "../postgis_config.h"
+
+#include "liblwgeom.h"
+#include "lwgeom_pg.h"
+
+extern Datum ST_KMeans(PG_FUNCTION_ARGS);
+
+typedef struct{
+ bool isdone;
+ bool isnull;
+ int result[1];
+ /* variable length */
+} kmeans_context;
+
+PG_FUNCTION_INFO_V1(ST_KMeans);
+Datum ST_KMeans(PG_FUNCTION_ARGS)
+{
+ WindowObject winobj = PG_WINDOW_OBJECT();
+ kmeans_context *context;
+ int64 curpos, rowcount;
+
+ rowcount = WinGetPartitionRowCount(winobj);
+ context = (kmeans_context *)
+ WinGetPartitionLocalMemory(winobj,
+ sizeof(kmeans_context) + sizeof(int) * rowcount);
+
+ if (!context->isdone)
+ {
+ int i, k, N;
+ bool isnull, isout;
+ LWGEOM **geoms;
+ int *r;
+
+ /* What is K? If it's NULL or invalid, we can't procede */
+ k = DatumGetInt32(WinGetFuncArgCurrent(winobj, 1, &isnull));
+ if (isnull || k <= 0)
+ {
+ context->isdone = true;
+ context->isnull = true;
+ PG_RETURN_NULL();
+ }
+
+ /* We also need a non-zero N */
+ N = (int) WinGetPartitionRowCount(winobj);
+ if (N <= 0)
+ {
+ context->isdone = true;
+ context->isnull = true;
+ PG_RETURN_NULL();
+ }
+
+ /* Read all the geometries from the partition window into a list */
+ geoms = palloc(sizeof(LWGEOM*) * N);
+ for (i = 0; i < N; i++)
+ {
+ GSERIALIZED *g;
+ Datum arg = WinGetFuncArgInPartition(winobj, 0, i,
+ WINDOW_SEEK_HEAD, false, &isnull, &isout);
+
+ /* Null geometries are entered as NULL pointers */
+ if (isnull)
+ {
+ geoms[i] = NULL;
+ continue;
+ }
+
+ g = (GSERIALIZED*)PG_DETOAST_DATUM_COPY(arg);
+ geoms[i] = lwgeom_from_gserialized(g);
+ }
+
+ /* Calculate k-means on the list! */
+ r = lwgeom_cluster_2d_kmeans((const LWGEOM **)geoms, N, k);
+
+ /* Clean up */
+ for (i = 0; i < N; i++)
+ if (geoms[i])
+ lwgeom_free(geoms[i]);
+
+ pfree(geoms);
+
+ if (!r)
+ {
+ context->isdone = true;
+ context->isnull = true;
+ PG_RETURN_NULL();
+ }
+
+ /* Safe the result */
+ memcpy(context->result, r, sizeof(int) * N);
+ pfree(r);
+ context->isdone = true;
+ }
+
+ if (context->isnull)
+ PG_RETURN_NULL();
+
+ curpos = WinGetCurrentPosition(winobj);
+ PG_RETURN_INT32(context->result[curpos]);
+}
Modified: trunk/postgis/postgis.sql.in
===================================================================
--- trunk/postgis/postgis.sql.in 2016-02-23 17:10:40 UTC (rev 14672)
+++ trunk/postgis/postgis.sql.in 2016-02-24 10:44:11 UTC (rev 14673)
@@ -3782,9 +3782,15 @@
);
-
--------------------------------------------------------------------------------
+-- Availability: 2.3.0
+CREATE FUNCTION ST_KMeans(geom geometry, k integer)
+ RETURNS integer
+ AS 'MODULE_PATHNAME', 'ST_KMeans'
+ LANGUAGE 'c' VOLATILE STRICT WINDOW;
+
+
-- Availability: 1.2.2
CREATE OR REPLACE FUNCTION ST_Relate(geom1 geometry, geom2 geometry)
RETURNS text
More information about the postgis-tickets
mailing list