[postgis-tickets] r14674 - #3362, ST_ClusterDBSCAN

Daniel Baston dbaston at gmail.com
Wed Feb 24 03:21:30 PST 2016


Author: dbaston
Date: 2016-02-24 03:21:30 -0800 (Wed, 24 Feb 2016)
New Revision: 14674

Modified:
   trunk/NEWS
   trunk/liblwgeom/cunit/cu_unionfind.c
   trunk/liblwgeom/lwgeom_geos.h
   trunk/liblwgeom/lwgeom_geos_cluster.c
   trunk/liblwgeom/lwunionfind.c
   trunk/liblwgeom/lwunionfind.h
   trunk/postgis/lwgeom_geos.c
   trunk/postgis/lwgeom_window.c
   trunk/postgis/postgis.sql.in
   trunk/regress/cluster.sql
   trunk/regress/cluster_expected
Log:
#3362, ST_ClusterDBSCAN

Modified: trunk/NEWS
===================================================================
--- trunk/NEWS	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/NEWS	2016-02-24 11:21:30 UTC (rev 14674)
@@ -11,6 +11,7 @@
   - populate_topology_layer (Sandro Santilli)
   - #2259 ST_Voronoi (Dan Baston)
   - #3339 ST_GeneratePoints (Paul Ramsey)
+  - #3362 ST_ClusterDBSCAN (Dan Baston) 
   - #3428 ST_Points (Dan Baston)
   - #3465 ST_KMeans (Paul Ramsey)
 

Modified: trunk/liblwgeom/cunit/cu_unionfind.c
===================================================================
--- trunk/liblwgeom/cunit/cu_unionfind.c	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/liblwgeom/cunit/cu_unionfind.c	2016-02-24 11:21:30 UTC (rev 14674)
@@ -110,6 +110,31 @@
 	UF_destroy(uf);
 }
 
+static void test_unionfind_collapse_cluster_ids(void)
+{
+	UNIONFIND* uf = UF_create(10);
+	
+	uf->clusters[0] = 8;
+	uf->clusters[1] = 5;
+	uf->clusters[2] = 5;
+	uf->clusters[3] = 5;
+	uf->clusters[4] = 7;
+	uf->clusters[5] = 5;
+	uf->clusters[6] = 8;
+	uf->clusters[7] = 7;
+	uf->clusters[8] = 8;
+	uf->clusters[9] = 7;
+
+	/* 5 -> 0
+	 * 7 -> 1
+	 * 8 -> 2
+	 */
+	uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
+	uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf);
+
+	CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+}
+
 void unionfind_suite_setup(void);
 void unionfind_suite_setup(void)
 {
@@ -118,4 +143,5 @@
 	PG_ADD_TEST(suite, test_unionfind_union);
 	PG_ADD_TEST(suite, test_unionfind_ordered_by_cluster);
 	PG_ADD_TEST(suite, test_unionfind_path_compression);
+	PG_ADD_TEST(suite, test_unionfind_collapse_cluster_ids);
 }

Modified: trunk/liblwgeom/lwgeom_geos.h
===================================================================
--- trunk/liblwgeom/lwgeom_geos.h	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/liblwgeom/lwgeom_geos.h	2016-02-24 11:21:30 UTC (rev 14674)
@@ -31,8 +31,8 @@
 #endif
 
 #include "liblwgeom.h"
+#include "lwunionfind.h"
 
-
 /*
 ** Public prototypes for GEOS utility functions.
 */
@@ -43,6 +43,8 @@
 
 int cluster_intersecting(GEOSGeometry** geoms, uint32_t num_geoms, GEOSGeometry*** clusterGeoms, uint32_t* num_clusters);
 int cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
+int cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
+int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points);
 
 POINTARRAY *ptarray_from_GEOSCoordSeq(const GEOSCoordSequence *cs, char want3d);
 

Modified: trunk/liblwgeom/lwgeom_geos_cluster.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos_cluster.c	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/liblwgeom/lwgeom_geos_cluster.c	2016-02-24 11:21:30 UTC (rev 14674)
@@ -31,26 +31,14 @@
 
 static const int STRTREE_NODE_CAPACITY = 10;
 
-/* Utility struct used to pass information to the GEOSSTRtree_query callback */
-struct UnionIfIntersectingContext
+/* Utility struct used to accumulate geometries in GEOSSTRtree_query callback */
+struct QueryContext
 {
-	UNIONFIND* uf;
-	char error;
-	uint32_t* p;
-	const GEOSPreparedGeometry* prep;
-	GEOSGeometry** geoms;
+	void** items_found;
+	uint32_t items_found_size;
+	uint32_t num_items_found;
 };
 
-/* Utility struct used to pass information to the GEOSSTRtree_query callback */
-struct UnionIfDWithinContext
-{
-	UNIONFIND* uf;
-	char error;
-	uint32_t* p;
-	LWGEOM** geoms;
-	double tolerance;
-};
-
 /* Utility struct to keep GEOSSTRtree and associated structures to be freed after use */
 struct STRTree
 {
@@ -62,8 +50,6 @@
 
 static struct STRTree make_strtree(void** geoms, uint32_t num_geoms, char is_lwgeom);
 static void destroy_strtree(struct STRTree tree);
-static void union_if_intersecting(void* item, void* userdata);
-static void union_if_dwithin(void* item, void* userdata);
 static int union_intersecting_pairs(GEOSGeometry** geoms, uint32_t num_geoms, UNIONFIND* uf);
 static int combine_geometries(UNIONFIND* uf, void** geoms, uint32_t num_geoms, void*** clustersGeoms, uint32_t* num_clusters, char is_lwgeom);
 
@@ -91,16 +77,16 @@
 		}
 		else
 		{
-            const GBOX* box = lwgeom_get_bbox(geoms[i]);
-            if (box)
-            {
-                tree.envelopes[i] = GBOX2GEOS(box);
-            } 
-            else
-            {
-                /* Empty geometry */
-                tree.envelopes[i] = GEOSGeom_createEmptyPolygon();
-            }
+			const GBOX* box = lwgeom_get_bbox(geoms[i]);
+			if (box)
+			{
+				tree.envelopes[i] = GBOX2GEOS(box);
+			} 
+			else
+			{
+				/* Empty geometry */
+				tree.envelopes[i] = GEOSGeom_createEmptyPolygon();
+			}
 		}
 		GEOSSTRtree_insert(tree.tree, tree.envelopes[i], &(tree.geom_ids[i]));
 	}
@@ -122,184 +108,111 @@
 	lwfree(tree.envelopes);
 }
 
-/* Callback function for GEOSSTRtree_query */
 static void
-union_if_intersecting(void* item, void* userdata)
+query_accumulate(void* item, void* userdata)
 {
-	struct UnionIfIntersectingContext *cxt = userdata;
-	if (cxt->error)
+	struct QueryContext *cxt = userdata;
+	if (!cxt->items_found)
 	{
-		return;
+		cxt->items_found_size = 8;
+		cxt->items_found = lwalloc(cxt->items_found_size * sizeof(void*));
 	}
-	uint32_t q = *((uint32_t*) item);
-	uint32_t p = *(cxt->p);
 
-	if (p != q && UF_find(cxt->uf, p) != UF_find(cxt->uf, q))
+	if (cxt->num_items_found >= cxt->items_found_size)
 	{
-		int geos_type = GEOSGeomTypeId(cxt->geoms[p]);
-		int geos_result;
-
-		/* Don't build prepared a geometry around a Point or MultiPoint -
-		 * there are some problems in the implementation, and it's not clear
-		 * there would be a performance benefit in any case.  (See #3433)
-		 */
-		if (geos_type != GEOS_POINT && geos_type != GEOS_MULTIPOINT)
-		{
-			/* Lazy initialize prepared geometry */
-			if (cxt->prep == NULL)
-			{
-				cxt->prep = GEOSPrepare(cxt->geoms[p]);
-			}
-			geos_result = GEOSPreparedIntersects(cxt->prep, cxt->geoms[q]);
-		}
-		else
-		{
-			geos_result = GEOSIntersects(cxt->geoms[p], cxt->geoms[q]);
-		}
-		if (geos_result > 1)
-		{
-			cxt->error = geos_result;
-			return;
-		}
-		if (geos_result)
-		{
-			UF_union(cxt->uf, p, q);
-		}
+		cxt->items_found_size = 2 * cxt->items_found_size;
+		cxt->items_found = lwrealloc(cxt->items_found, cxt->items_found_size * sizeof(void*));
 	}
+	cxt->items_found[cxt->num_items_found++] = item;
 }
 
-/* Callback function for GEOSSTRtree_query */
-static void
-union_if_dwithin(void* item, void* userdata)
-{
-	struct UnionIfDWithinContext *cxt = userdata;
-	if (cxt->error)
-	{
-		return;
-	}
-	uint32_t q = *((uint32_t*) item);
-	uint32_t p = *(cxt->p);
-
-	if (p != q && UF_find(cxt->uf, p) != UF_find(cxt->uf, q))
-	{
-		double mindist = lwgeom_mindistance2d_tolerance(cxt->geoms[p], cxt->geoms[q], cxt->tolerance);
-		if (mindist == FLT_MAX)
-		{
-			cxt->error = 1;
-			return;
-		}
-
-		if (mindist <= cxt->tolerance)
-		{
-			UF_union(cxt->uf, p, q);
-		}
-	}
-}
-
 /* Identify intersecting geometries and mark them as being in the same set */
 static int
 union_intersecting_pairs(GEOSGeometry** geoms, uint32_t num_geoms, UNIONFIND* uf)
 {
-	uint32_t i;
+	uint32_t p, i;
+	struct STRTree tree;
+	struct QueryContext cxt =
+	{
+		.items_found = NULL,
+		.num_items_found = 0,
+		.items_found_size = 0
+	};
+	int success = LW_SUCCESS;
 
 	if (num_geoms <= 1)
-	{
 		return LW_SUCCESS;
-	}
 
-	struct STRTree tree = make_strtree((void**) geoms, num_geoms, 0);
+	tree = make_strtree((void**) geoms, num_geoms, 0);
 	if (tree.tree == NULL)
 	{
 		destroy_strtree(tree);
 		return LW_FAILURE;
 	}
-	for (i = 0; i < num_geoms; i++)
+
+	for (p = 0; p < num_geoms; p++)
 	{
-        if (GEOSisEmpty(geoms[i]))
-        {
-            continue;
-        }
+		const GEOSPreparedGeometry* prep = NULL;
+		GEOSGeometry* query_envelope;
 
-		struct UnionIfIntersectingContext cxt =
-		{
-			.uf = uf,
-			.error = 0,
-			.p = &i,
-			.prep = NULL,
-			.geoms = geoms
-		};
-		GEOSGeometry* query_envelope = GEOSEnvelope(geoms[i]);
-		GEOSSTRtree_query(tree.tree, query_envelope, &union_if_intersecting, &cxt);
+		if (GEOSisEmpty(geoms[p]))
+			continue;
 
-		GEOSGeom_destroy(query_envelope);
-		GEOSPreparedGeom_destroy(cxt.prep);
-		if (cxt.error)
-		{
-			return LW_FAILURE;
-		}
-	}
+		cxt.num_items_found = 0;
+		query_envelope = GEOSEnvelope(geoms[p]);
+		GEOSSTRtree_query(tree.tree, query_envelope, &query_accumulate, &cxt);
 
-	destroy_strtree(tree);
-	return LW_SUCCESS;
-}
-
-/* Identify geometries within a distance tolerance and mark them as being in the same set */
-static int
-union_pairs_within_distance(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double tolerance)
-{
-	uint32_t i;
-
-	if (num_geoms <= 1)
-	{
-		return LW_SUCCESS;
-	}
-
-	struct STRTree tree = make_strtree((void**) geoms, num_geoms, 1);
-	if (tree.tree == NULL)
-	{
-		destroy_strtree(tree);
-		return LW_FAILURE;
-	}
-
-	for (i = 0; i < num_geoms; i++)
-	{
-		struct UnionIfDWithinContext cxt =
+		for (i = 0; i < cxt.num_items_found; i++)
 		{
-			.uf = uf,
-			.error = 0,
-			.p = &i,
-			.geoms = geoms,
-			.tolerance = tolerance
-		};
+			uint32_t q = *((uint32_t*) cxt.items_found[i]);
 
-        const GBOX* geom_extent = lwgeom_get_bbox(geoms[i]);
-        if (!geom_extent)
-        {
-            /* Empty geometry */
-            continue;
-        }
-		GBOX* query_extent = gbox_clone(geom_extent);
-		gbox_expand(query_extent, tolerance);
-		GEOSGeometry* query_envelope = GBOX2GEOS(query_extent);
+			if (p != q && UF_find(uf, p) != UF_find(uf, q))
+			{
+				int geos_type = GEOSGeomTypeId(geoms[p]);
+				int geos_result;
 
-		if (!query_envelope)
-		{
-			destroy_strtree(tree);
-			return LW_FAILURE;
+				/* Don't build prepared a geometry around a Point or MultiPoint -
+				 * there are some problems in the implementation, and it's not clear
+				 * there would be a performance benefit in any case.  (See #3433)
+				 */
+				if (geos_type != GEOS_POINT && geos_type != GEOS_MULTIPOINT)
+				{
+					/* Lazy initialize prepared geometry */
+					if (prep == NULL)
+					{
+						prep = GEOSPrepare(geoms[p]);
+					}
+					geos_result = GEOSPreparedIntersects(prep, geoms[q]);
+				}
+				else
+				{
+					geos_result = GEOSIntersects(geoms[p], geoms[q]);
+				}
+				if (geos_result > 1)
+				{
+					success = LW_FAILURE;
+					break;
+				}
+				else if (geos_result)
+				{
+					UF_union(uf, p, q);
+				}
+			}
 		}
 
-		GEOSSTRtree_query(tree.tree, query_envelope, &union_if_dwithin, &cxt);
+		if (prep)
+			GEOSPreparedGeom_destroy(prep);
+		GEOSGeom_destroy(query_envelope);
 
-		lwfree(query_extent);
-		GEOSGeom_destroy(query_envelope);
-		if (cxt.error)
-		{
-			return LW_FAILURE;
-		}
+		if (!success)
+			break;
 	}
 
+	if (cxt.items_found)
+		lwfree(cxt.items_found);
+
 	destroy_strtree(tree);
-	return LW_SUCCESS;
+	return success;
 }
 
 /** Takes an array of GEOSGeometry* and constructs an array of GEOSGeometry*, where each element in the constructed
@@ -328,10 +241,109 @@
 int
 cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
 {
+	int min_points = 1;
+	return cluster_dbscan(geoms, num_geoms, tolerance, min_points, clusterGeoms, num_clusters);
+}
+
+
+static int
+dbscan_update_context(GEOSSTRtree* tree, struct QueryContext* cxt, LWGEOM** geoms, uint32_t p, double eps)
+{
+	cxt->num_items_found = 0;
+
+	const GBOX* geom_extent = lwgeom_get_bbox(geoms[p]);
+	GBOX* query_extent = gbox_clone(geom_extent);
+	gbox_expand(query_extent, eps);
+	GEOSGeometry* query_envelope = GBOX2GEOS(query_extent);
+
+	if (!query_envelope)
+		return LW_FAILURE;
+
+	GEOSSTRtree_query(tree, query_envelope, &query_accumulate, cxt);
+
+	lwfree(query_extent);
+	GEOSGeom_destroy(query_envelope);
+
+	return LW_SUCCESS;
+}
+
+int
+union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points)
+{
+	uint32_t p, i;
+	struct STRTree tree;
+	struct QueryContext cxt =
+	{
+		.items_found = NULL,
+		.num_items_found = 0,
+		.items_found_size = 0
+	};
+	int success = LW_SUCCESS;
+
+	if (num_geoms <= 1)
+		return LW_SUCCESS;
+
+	tree = make_strtree((void**) geoms, num_geoms, 1);
+	if (tree.tree == NULL)
+	{
+		destroy_strtree(tree);
+		return LW_FAILURE;
+	}
+
+	for (p = 0; p < num_geoms; p++)
+	{
+		uint32_t num_neighbors = 0;
+		uint32_t* neighbors;
+
+		if (lwgeom_is_empty(geoms[p]))
+			continue;
+
+		dbscan_update_context(tree.tree, &cxt, geoms, p, eps);
+		neighbors = lwalloc(cxt.num_items_found * sizeof(uint32_t*));
+
+		for (i = 0; i < cxt.num_items_found; i++)
+		{
+			uint32_t q = *((uint32_t*) cxt.items_found[i]);
+
+			double mindist = lwgeom_mindistance2d_tolerance(geoms[p], geoms[q], eps);
+			if (mindist == FLT_MAX)
+			{
+				success = LW_FAILURE;
+				break;
+			}
+
+			if (mindist <= eps)
+				neighbors[num_neighbors++] = q;
+		}
+
+		if (num_neighbors >= min_points)
+		{
+			for (i = 0; i < num_neighbors; i++)
+			{
+				UF_union(uf, p, neighbors[i]);
+			}
+		}
+
+		lwfree(neighbors);
+
+		if (!success)
+			break;
+	}
+
+	if (cxt.items_found)
+		lwfree(cxt.items_found);
+
+	destroy_strtree(tree);
+	return success;
+}
+
+int
+cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
+{
 	int cluster_success;
 	UNIONFIND* uf = UF_create(num_geoms);
 
-	if (union_pairs_within_distance(geoms, num_geoms, uf, tolerance) == LW_FAILURE)
+	if (union_dbscan(geoms, num_geoms, uf, eps, min_points) == LW_FAILURE)
 	{
 		UF_destroy(uf);
 		return LW_FAILURE;

Modified: trunk/liblwgeom/lwunionfind.c
===================================================================
--- trunk/liblwgeom/lwunionfind.c	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/liblwgeom/lwunionfind.c	2016-02-24 11:21:30 UTC (rev 14674)
@@ -134,6 +134,33 @@
 	lwfree(cluster_id_ptr_by_elem_id);
 	return ordered_ids;
 }
+
+uint32_t*
+UF_get_collapsed_cluster_ids(UNIONFIND* uf)
+{
+	uint32_t* ordered_components = UF_ordered_by_cluster(uf);
+	uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
+	uint32_t last_old_id, current_new_id, i;
+
+	last_old_id = UF_find(uf, ordered_components[0]);
+	current_new_id = 0;
+	for (i = 0; i < uf->N; i++)
+	{
+		uint32_t j = ordered_components[i];
+		uint32_t current_old_id = UF_find(uf, j);
+
+		if (current_old_id != last_old_id)
+			current_new_id++;
+
+		new_ids[j] = current_new_id;
+		last_old_id = current_old_id;
+	}
+
+	lwfree(ordered_components);
+
+	return new_ids;
+}
+
 static int
 cmp_int(const void *a, const void *b)
 {

Modified: trunk/liblwgeom/lwunionfind.h
===================================================================
--- trunk/liblwgeom/lwunionfind.h	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/liblwgeom/lwunionfind.h	2016-02-24 11:21:30 UTC (rev 14674)
@@ -52,4 +52,7 @@
  * same cluster are contiguous in the array */
 uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
 
+/* Replace the cluster ids in a UNIONFIND with sequential ids starting at zero. */
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf);
+
 #endif

Modified: trunk/postgis/lwgeom_geos.c
===================================================================
--- trunk/postgis/lwgeom_geos.c	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/postgis/lwgeom_geos.c	2016-02-24 11:21:30 UTC (rev 14674)
@@ -3205,6 +3205,7 @@
 	LWGEOM** lw_inputs;
 	LWGEOM** lw_results;
 	double tolerance;
+	int min_points;
 	int srid=SRID_UNKNOWN;
 
 	/* Parameters used to construct a result array */
@@ -3213,13 +3214,31 @@
 	char elmalign;
 
 	/* Null array, null geometry (should be empty?) */
-    if (PG_ARGISNULL(0))
-        PG_RETURN_NULL();
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
 
-    array = PG_GETARG_ARRAYTYPE_P(0);
+	array = PG_GETARG_ARRAYTYPE_P(0);
+
 	tolerance = PG_GETARG_FLOAT8(1);
-    nelems = array_nelems_not_null(array);
+	if (tolerance < 0)
+	{
+		lwpgerror("Tolerance must be a positive number.");
+		PG_RETURN_NULL();
+	}
 
+	if (PG_NARGS() < 3)
+		min_points = 1;
+	else
+		min_points = PG_GETARG_INT32(2);
+
+	if (min_points < 1)
+	{
+		lwpgerror("min_points must be a positive number.");
+		PG_RETURN_NULL();
+	}
+
+	nelems = array_nelems_not_null(array);
+
 	POSTGIS_DEBUGF(3, "cluster_within_distance_garray: number of non-null elements: %d", nelems);
 
 	if ( nelems == 0 ) PG_RETURN_NULL();
@@ -3235,7 +3254,7 @@
 		PG_RETURN_NULL();
 	}
 
-	if (cluster_within_distance(lw_inputs, nelems, tolerance, &lw_results, &nclusters) != LW_SUCCESS)
+	if (cluster_dbscan(lw_inputs, nelems, tolerance, min_points, &lw_results, &nclusters) != LW_SUCCESS)
 	{
 		elog(ERROR, "cluster_within: Error performing clustering");
 		PG_RETURN_NULL();

Modified: trunk/postgis/lwgeom_window.c
===================================================================
--- trunk/postgis/lwgeom_window.c	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/postgis/lwgeom_window.c	2016-02-24 11:21:30 UTC (rev 14674)
@@ -19,32 +19,146 @@
  **********************************************************************
  *
  * Copyright 2016 Paul Ramsey <pramsey at cleverelephant.ca>
+ * Copyright 2016 Daniel Baston <dbaston at gmail.com>
  *
  **********************************************************************/
 
+#include "../postgis_config.h"
 
+/* PostgreSQL */
 #include "postgres.h"
-
-#include "fmgr.h"
 #include "funcapi.h"
-#include "miscadmin.h"
-#include "utils/builtins.h"
 #include "windowapi.h"
 
-#include "../postgis_config.h"
-
+/* PostGIS */
 #include "liblwgeom.h"
+#include "lwunionfind.h" /* TODO move into liblwgeom.h ? */
+#include "lwgeom_geos.h"
+#include "lwgeom_log.h"
 #include "lwgeom_pg.h"
 
+extern Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS);
 extern Datum ST_KMeans(PG_FUNCTION_ARGS);
 
-typedef struct{
+typedef struct {
 	bool	isdone;
 	bool	isnull;
 	int		result[1];
 	/* variable length */
 } kmeans_context;
 
+typedef struct
+{
+	uint32_t cluster_id;
+	char is_null;        /* NULL may result from a NULL geometry input, or it may be used by 
+							algorithms such as DBSCAN that do not assign all inputs to a
+							cluster. */
+} dbscan_cluster_result;
+
+typedef struct
+{
+	char is_error;
+	dbscan_cluster_result cluster_assignments[1];
+} dbscan_context;
+
+static LWGEOM*
+read_lwgeom_from_partition(WindowObject win_obj, uint32_t i, bool* is_null)
+{
+	GSERIALIZED* g;
+	Datum arg = WinGetFuncArgInPartition(win_obj, 0, i, WINDOW_SEEK_HEAD, false, is_null, NULL);
+
+	if (*is_null) {
+		/* So that the indexes in our clustering input array can match our partition positions,
+		 * 		 * toss an empty point into the clustering inputs, as a pass-through.
+		 * 		 		 * NOTE: this will cause gaps in the output cluster id sequence.
+		  		 		 		 * */
+		return lwpoint_as_lwgeom(lwpoint_construct_empty(0, 0, 0));
+	}
+
+	g = (GSERIALIZED*) PG_DETOAST_DATUM(arg);
+	return lwgeom_from_gserialized(g);
+}
+
+PG_FUNCTION_INFO_V1(ST_ClusterDBSCAN);
+Datum ST_ClusterDBSCAN(PG_FUNCTION_ARGS)
+{
+	WindowObject win_obj = PG_WINDOW_OBJECT();
+	uint32_t row = WinGetCurrentPosition(win_obj);
+	uint32_t ngeoms = WinGetPartitionRowCount(win_obj);	
+	dbscan_context* context = WinGetPartitionLocalMemory(win_obj, sizeof(dbscan_context) + ngeoms * sizeof(dbscan_cluster_result));
+
+	if (row == 0) /* beginning of the partition; do all of the work now */
+	{
+		uint32_t i;
+		uint32_t* result_ids;
+		LWGEOM** geoms;
+		UNIONFIND* uf;
+		bool tolerance_is_null;
+		bool minpoints_is_null;
+		Datum tolerance_datum = WinGetFuncArgCurrent(win_obj, 1, &tolerance_is_null);
+		Datum minpoints_datum = WinGetFuncArgCurrent(win_obj, 2, &minpoints_is_null);
+		double tolerance = DatumGetFloat8(tolerance_datum);
+		int minpoints = DatumGetInt32(minpoints_datum);
+
+		context->is_error = LW_TRUE; /* until proven otherwise */
+
+		/* Validate input parameters */
+		if (tolerance_is_null || tolerance < 0)
+		{
+			lwpgerror("Tolerance must be a positive number", tolerance);
+			PG_RETURN_NULL();
+		}
+		if (minpoints_is_null || minpoints < 0)
+		{
+			lwpgerror("Minpoints must be a positive number", minpoints);
+		}
+
+		initGEOS(lwnotice, lwgeom_geos_error);
+		geoms = lwalloc(ngeoms * sizeof(LWGEOM*));
+		uf = UF_create(ngeoms);
+		for (i = 0; i < ngeoms; i++)
+		{
+			geoms[i] = read_lwgeom_from_partition(win_obj, i, &(context->cluster_assignments[i].is_null));
+
+			if (!geoms[i]) {
+				/* TODO release memory ? */
+				lwpgerror("Error reading geometry.");
+				PG_RETURN_NULL();
+			}
+		}
+
+		if (union_dbscan(geoms, ngeoms, uf, tolerance, minpoints) == LW_SUCCESS)
+			context->is_error = LW_FALSE;
+
+		for (i = 0; i < ngeoms; i++)
+		{
+			lwgeom_free(geoms[i]);
+		}
+		lwfree(geoms);
+
+		if (context->is_error)
+		{
+			UF_destroy(uf);
+			lwpgerror("Error during clustering");
+			PG_RETURN_NULL();
+		}
+
+		result_ids = UF_get_collapsed_cluster_ids(uf);
+		for (i = 0; i < ngeoms; i++)
+		{
+			context->cluster_assignments[i].cluster_id = result_ids[i];
+		}
+
+		lwfree(result_ids);
+		UF_destroy(uf);
+	}
+
+	if (context->cluster_assignments[row].is_null)
+		PG_RETURN_NULL();
+
+	PG_RETURN_INT32(context->cluster_assignments[row].cluster_id);
+}
+
 PG_FUNCTION_INFO_V1(ST_KMeans);
 Datum ST_KMeans(PG_FUNCTION_ARGS)
 {

Modified: trunk/postgis/postgis.sql.in
===================================================================
--- trunk/postgis/postgis.sql.in	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/postgis/postgis.sql.in	2016-02-24 11:21:30 UTC (rev 14674)
@@ -1648,6 +1648,12 @@
     AS 'MODULE_PATHNAME',  'cluster_within_distance_garray'
     LANGUAGE 'c' IMMUTABLE STRICT;
 
+-- Availability: 2.3
+CREATE OR REPLACE FUNCTION ST_ClusterDBSCAN (geometry, eps float8, minpoints int)
+	RETURNS int
+	AS 'MODULE_PATHNAME', 'ST_ClusterDBSCAN'
+	LANGUAGE 'c' IMMUTABLE STRICT WINDOW;
+
 -- Availability: 1.2.2
 CREATE OR REPLACE FUNCTION ST_LineMerge(geometry)
 	RETURNS geometry
@@ -3678,6 +3684,12 @@
 	AS 'MODULE_PATHNAME'
 	LANGUAGE 'c';
 
+-- Availability: 2.3
+CREATE OR REPLACE FUNCTION pgis_geometry_accum_transfn(pgis_abs, geometry, float8, int)
+	RETURNS pgis_abs
+	AS 'MODULE_PATHNAME'
+	LANGUAGE 'c';
+
 -- Availability: 1.4.0
 CREATE OR REPLACE FUNCTION pgis_geometry_accum_finalfn(pgis_abs)
 	RETURNS geometry[]

Modified: trunk/regress/cluster.sql
===================================================================
--- trunk/regress/cluster.sql	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/regress/cluster.sql	2016-02-24 11:21:30 UTC (rev 14674)
@@ -14,3 +14,24 @@
 SELECT 't2', ST_AsText(unnest(ST_ClusterIntersecting(ST_Accum(geom ORDER BY id)))) FROM cluster_inputs;
 SELECT 't3', ST_AsText(unnest(ST_ClusterWithin(geom, 1.4 ORDER BY id))) FROM cluster_inputs;
 SELECT 't4', ST_AsText(unnest(ST_ClusterWithin(ST_Accum(geom ORDER BY id), 1.5))) FROM cluster_inputs;
+
+-- tests for ST_DBSCAN
+
+CREATE TEMPORARY TABLE dbscan_inputs (id int, geom geometry);
+INSERT INTO dbscan_inputs VALUES
+(1, 'POINT (0 0)'),
+(2, 'POINT (0 1)'),
+(3, 'POINT (-0.5 0.5)'),
+(4, 'POINT (1 0)'),
+(5, 'POINT (1 1)'),
+(6, 'POINT (1.0 0.5)');
+
+/* minpoints = 1, equivalent to ST_ClusterWithin */
+SELECT 't101', id, ST_ClusterDBSCAN(geom, eps := 0.8, minpoints := 1) OVER () from dbscan_inputs;
+
+/* minpoints = 4, no clusters */
+SELECT 't102', id, ST_ClusterDBSCAN(geom, eps := 0.8, minpoints := 4) OVER () from dbscan_inputs;
+
+/* minpoints = 3, but eps too small to form cluster on left */
+SELECT 't103', id, ST_ClusterDBSCAN(geom, eps := 0.6, minpoints := 3) OVER () from dbscan_inputs;
+

Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected	2016-02-24 10:44:11 UTC (rev 14673)
+++ trunk/regress/cluster_expected	2016-02-24 11:21:30 UTC (rev 14674)
@@ -9,3 +9,21 @@
 t3|GEOMETRYCOLLECTION(POLYGON EMPTY)
 t4|GEOMETRYCOLLECTION(LINESTRING(0 0,1 1),LINESTRING(5 5,4 4),LINESTRING(0 0,-1 -1),LINESTRING(6 6,7 7),POLYGON((0 0,4 0,4 4,0 4,0 0)))
 t4|GEOMETRYCOLLECTION(POLYGON EMPTY)
+t101|1|0
+t101|2|0
+t101|3|0
+t101|4|1
+t101|5|1
+t101|6|1
+t102|1|0
+t102|2|1
+t102|3|2
+t102|4|3
+t102|5|4
+t102|6|5
+t103|1|0
+t103|2|1
+t103|3|2
+t103|4|3
+t103|5|3
+t103|6|3



More information about the postgis-tickets mailing list