[postgis-tickets] r14979 - Optimize and reduce memory usage in ST_ClusterDBSCAN / ST_ClusterWithin

Daniel Baston dbaston at gmail.com
Fri Jun 24 18:30:12 PDT 2016


Author: dbaston
Date: 2016-06-24 18:30:12 -0700 (Fri, 24 Jun 2016)
New Revision: 14979

Modified:
   trunk/liblwgeom/lwgeom_geos.c
   trunk/liblwgeom/lwgeom_geos.h
   trunk/liblwgeom/lwgeom_geos_cluster.c
Log:
Optimize and reduce memory usage in ST_ClusterDBSCAN / ST_ClusterWithin

Modified: trunk/liblwgeom/lwgeom_geos.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos.c	2016-06-24 06:34:41 UTC (rev 14978)
+++ trunk/liblwgeom/lwgeom_geos.c	2016-06-25 01:30:12 UTC (rev 14979)
@@ -482,6 +482,44 @@
 	return g;
 }
 
+GEOSGeometry*
+make_geos_point(double x, double y)
+{
+	GEOSCoordSequence* seq = GEOSCoordSeq_create(1, 2);
+	GEOSGeometry* geom = NULL;
+
+	if (!seq)
+		return NULL;
+
+	GEOSCoordSeq_setX(seq, 0, x);
+	GEOSCoordSeq_setY(seq, 0, y);
+
+	geom = GEOSGeom_createPoint(seq);
+	if (!geom)
+		GEOSCoordSeq_destroy(seq);
+	return geom;
+}
+
+GEOSGeometry*
+make_geos_segment(double x1, double y1, double x2, double y2)
+{
+	GEOSCoordSequence* seq = GEOSCoordSeq_create(2, 2);
+	GEOSGeometry* geom = NULL;
+
+	if (!seq)
+		return NULL;
+
+	GEOSCoordSeq_setX(seq, 0, x1);
+	GEOSCoordSeq_setY(seq, 0, y1);
+	GEOSCoordSeq_setX(seq, 1, x2);
+	GEOSCoordSeq_setY(seq, 1, y2);
+
+	geom = GEOSGeom_createLineString(seq);
+	if (!geom)
+		GEOSCoordSeq_destroy(seq);
+	return geom;
+}
+
 const char*
 lwgeom_geos_version()
 {

Modified: trunk/liblwgeom/lwgeom_geos.h
===================================================================
--- trunk/liblwgeom/lwgeom_geos.h	2016-06-24 06:34:41 UTC (rev 14978)
+++ trunk/liblwgeom/lwgeom_geos.h	2016-06-25 01:30:12 UTC (rev 14979)
@@ -41,6 +41,9 @@
 GEOSGeometry * GBOX2GEOS(const GBOX *g);
 GEOSGeometry * LWGEOM_GEOS_buildArea(const GEOSGeometry* geom_in);
 
+GEOSGeometry * make_geos_point(double x, double y);
+GEOSGeometry * make_geos_segment(double x1, double y1, double x2, double y2);
+
 int cluster_intersecting(GEOSGeometry** geoms, uint32_t num_geoms, GEOSGeometry*** clusterGeoms, uint32_t* num_clusters);
 int cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
 int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** is_in_cluster_ret);

Modified: trunk/liblwgeom/lwgeom_geos_cluster.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos_cluster.c	2016-06-24 06:34:41 UTC (rev 14978)
+++ trunk/liblwgeom/lwgeom_geos_cluster.c	2016-06-25 01:30:12 UTC (rev 14979)
@@ -18,7 +18,7 @@
  *
  **********************************************************************
  *
- * Copyright 2015 Daniel Baston <dbaston at gmail.com>
+ * Copyright 2015-2016 Daniel Baston <dbaston at gmail.com>
  *
  **********************************************************************/
 
@@ -31,7 +31,7 @@
 
 static const int STRTREE_NODE_CAPACITY = 10;
 
-/* Utility struct used to accumulate geometries in GEOSSTRtree_query callback */
+/* Utility struct used to accumulate items in GEOSSTRtree_query callback */
 struct QueryContext
 {
 	void** items_found;
@@ -49,11 +49,34 @@
 };
 
 static struct STRTree make_strtree(void** geoms, uint32_t num_geoms, char is_lwgeom);
-static void destroy_strtree(struct STRTree tree);
+static void destroy_strtree(struct STRTree * tree);
 static int union_intersecting_pairs(GEOSGeometry** geoms, uint32_t num_geoms, UNIONFIND* uf);
 static int combine_geometries(UNIONFIND* uf, void** geoms, uint32_t num_geoms, void*** clustersGeoms, uint32_t* num_clusters, char is_lwgeom);
 
-/** Make a GEOSSTRtree of either GEOSGeometry* or LWGEOM* pointers */
+/* Make a minimal GEOSGeometry* whose Envelope covers the same 2D extent as
+ * the supplied GBOX.  This is faster and uses less memory than building a
+ * five-point polygon with GBOX2GEOS.
+ */
+static GEOSGeometry*
+geos_envelope_surrogate(const LWGEOM* g)
+{
+	if (lwgeom_is_empty(g))
+		return GEOSGeom_createEmptyPolygon();
+
+	if (lwgeom_get_type(g) == POINTTYPE) {
+		const POINT2D* pt = getPoint2d_cp(lwgeom_as_lwpoint(g)->point, 0);
+		return make_geos_point(pt->x, pt->y);
+	} else {
+		const GBOX* box = lwgeom_get_bbox(g);
+		if (!box)
+			return NULL;
+
+		return make_geos_segment(box->xmin, box->ymin, box->xmax, box->ymax);
+	}
+}
+
+/** Make a GEOSSTRtree that stores a pointer to a variable containing 
+ *  the array index of the input geoms */
 static struct STRTree
 make_strtree(void** geoms, uint32_t num_geoms, char is_lwgeom)
 {
@@ -63,49 +86,50 @@
 	{
 		return tree;
 	}
-	tree.envelopes = lwalloc(num_geoms * sizeof(GEOSGeometry*));
 	tree.geom_ids  = lwalloc(num_geoms * sizeof(uint32_t));
 	tree.num_geoms = num_geoms;
 
-	size_t i;
-	for (i = 0; i < num_geoms; i++)
+	if (is_lwgeom)
 	{
-		tree.geom_ids[i] = i;
-		if (!is_lwgeom)
+		uint32_t i;
+		tree.envelopes = lwalloc(num_geoms * sizeof(GEOSGeometry*));
+		for (i = 0; i < num_geoms; i++)
 		{
-			tree.envelopes[i] = GEOSEnvelope(geoms[i]);
+			tree.geom_ids[i] = i;
+			tree.envelopes[i] = geos_envelope_surrogate(geoms[i]);
+			GEOSSTRtree_insert(tree.tree, tree.envelopes[i], &(tree.geom_ids[i]));
 		}
-		else
+	}
+	else
+	{
+		uint32_t i;
+		tree.envelopes = NULL;
+		for (i = 0; i < num_geoms; i++)
 		{
-			const GBOX* box = lwgeom_get_bbox(geoms[i]);
-			if (box)
-			{
-				tree.envelopes[i] = GBOX2GEOS(box);
-			}
-			else
-			{
-				/* Empty geometry */
-				tree.envelopes[i] = GEOSGeom_createEmptyPolygon();
-			}
+			tree.geom_ids[i] = i;
+			GEOSSTRtree_insert(tree.tree, geoms[i], &(tree.geom_ids[i]));
 		}
-		GEOSSTRtree_insert(tree.tree, tree.envelopes[i], &(tree.geom_ids[i]));
 	}
+
 	return tree;
 }
 
 /** Clean up STRTree after use */
 static void
-destroy_strtree(struct STRTree tree)
+destroy_strtree(struct STRTree * tree)
 {
 	size_t i;
-	GEOSSTRtree_destroy(tree.tree);
+	GEOSSTRtree_destroy(tree->tree);
 
-	for (i = 0; i < tree.num_geoms; i++)
+	if (tree->envelopes)
 	{
-		GEOSGeom_destroy(tree.envelopes[i]);
+		for (i = 0; i < tree->num_geoms; i++)
+		{
+			GEOSGeom_destroy(tree->envelopes[i]);
+		}
+		lwfree(tree->envelopes);
 	}
-	lwfree(tree.geom_ids);
-	lwfree(tree.envelopes);
+	lwfree(tree->geom_ids);
 }
 
 static void
@@ -143,24 +167,22 @@
 	if (num_geoms <= 1)
 		return LW_SUCCESS;
 
-	tree = make_strtree((void**) geoms, num_geoms, 0);
+	tree = make_strtree((void**) geoms, num_geoms, LW_FALSE);
 	if (tree.tree == NULL)
 	{
-		destroy_strtree(tree);
+		destroy_strtree(&tree);
 		return LW_FAILURE;
 	}
 
 	for (p = 0; p < num_geoms; p++)
 	{
 		const GEOSPreparedGeometry* prep = NULL;
-		GEOSGeometry* query_envelope;
 
 		if (GEOSisEmpty(geoms[p]))
 			continue;
 
 		cxt.num_items_found = 0;
-		query_envelope = GEOSEnvelope(geoms[p]);
-		GEOSSTRtree_query(tree.tree, query_envelope, &query_accumulate, &cxt);
+		GEOSSTRtree_query(tree.tree, geoms[p], &query_accumulate, &cxt);
 
 		for (i = 0; i < cxt.num_items_found; i++)
 		{
@@ -202,7 +224,6 @@
 
 		if (prep)
 			GEOSPreparedGeom_destroy(prep);
-		GEOSGeom_destroy(query_envelope);
 
 		if (!success)
 			break;
@@ -211,7 +232,7 @@
 	if (cxt.items_found)
 		lwfree(cxt.items_found);
 
-	destroy_strtree(tree);
+	destroy_strtree(&tree);
 	return success;
 }
 
@@ -235,31 +256,63 @@
 	return cluster_success;
 }
 
-
 static int
 dbscan_update_context(GEOSSTRtree* tree, struct QueryContext* cxt, LWGEOM** geoms, uint32_t p, double eps)
 {
 	cxt->num_items_found = 0;
 
-	const GBOX* geom_extent = lwgeom_get_bbox(geoms[p]);
-	GBOX* query_extent = gbox_clone(geom_extent);
-	gbox_expand(query_extent, eps);
-	GEOSGeometry* query_envelope = GBOX2GEOS(query_extent);
+	GEOSGeometry* query_envelope;
+	if (geoms[p]->type == POINTTYPE)
+	{
+		const POINT2D* pt = getPoint2d_cp(lwgeom_as_lwpoint(geoms[p])->point, 0);
+		query_envelope = make_geos_segment( pt->x - eps, pt->y - eps, pt->x + eps, pt->y + eps );
+	} else {
+		const GBOX* box = lwgeom_get_bbox(geoms[p]);
+		query_envelope = make_geos_segment( box->xmin - eps, box->ymin - eps, box->xmax + eps, box->ymax + eps );
+	}
 
 	if (!query_envelope)
 		return LW_FAILURE;
 
 	GEOSSTRtree_query(tree, query_envelope, &query_accumulate, cxt);
 
-	lwfree(query_extent);
 	GEOSGeom_destroy(query_envelope);
 
 	return LW_SUCCESS;
 }
 
-int
-union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
+/* Union p's cluster with q's cluster, if q is not a border point of another cluster.
+ * Applicable to DBSCAN with minpoints > 1.
+ */
+static void
+union_if_available(UNIONFIND* uf, uint32_t p, uint32_t q, char* is_in_core, char* in_a_cluster)
 {
+	if (in_a_cluster[q])
+	{
+		/* Can we merge p's cluster with q's cluster?  We can do this only
+		 * if both p and q are considered _core_ points of their respective
+		 * clusters.
+		 */
+		 if (is_in_core[q])
+		 {
+			 UF_union(uf, p, q);
+		 }
+	}
+	else
+	{
+		UF_union(uf, p, q);
+		in_a_cluster[q] = LW_TRUE;
+	}
+}
+
+/* An optimized DBSCAN union for the case where min_points == 1.
+ * If min_points == 1, then we don't care how many neighbors we find; we can union clusters
+ * on the fly, as as we go through the distance calculations.  This potentially allows us
+ * to avoid some distance computations altogether.
+ */
+static int
+union_dbscan_minpoints_1(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, char** in_a_cluster_ret)
+{
 	uint32_t p, i;
 	struct STRTree tree;
 	struct QueryContext cxt =
@@ -269,16 +322,82 @@
 		.items_found_size = 0
 	};
 	int success = LW_SUCCESS;
+
+	if (num_geoms <= 1)
+		return LW_SUCCESS;
+
+	tree = make_strtree((void**) geoms, num_geoms, LW_TRUE);
+	if (tree.tree == NULL)
+	{
+		destroy_strtree(&tree);
+		return LW_FAILURE;
+	}
+
+	for (p = 0; p < num_geoms; p++)
+	{
+		if (lwgeom_is_empty(geoms[p]))
+			continue;
+
+		dbscan_update_context(tree.tree, &cxt, geoms, p, eps);
+		for (i = 0; i < cxt.num_items_found; i++)
+		{
+			uint32_t q = *((uint32_t*) cxt.items_found[i]);
+
+			if (UF_find(uf, p) != UF_find(uf, q))
+			{
+				double mindist = lwgeom_mindistance2d_tolerance(geoms[p], geoms[q], eps);
+				if (mindist == FLT_MAX)
+				{
+					success = LW_FAILURE;
+					break;
+				}
+
+				if (mindist <= eps)
+					UF_union(uf, p, q);
+			}
+		}
+	}
+
+	if (in_a_cluster_ret)
+	{
+		char* in_a_cluster = lwalloc(num_geoms * sizeof(char));
+		for (i = 0; i < num_geoms; i++)
+			in_a_cluster[i] = LW_TRUE;
+		*in_a_cluster_ret = in_a_cluster;
+	}
+
+	if (cxt.items_found)
+		lwfree(cxt.items_found);
+
+	destroy_strtree(&tree);
+
+	return success;
+}
+
+static int
+union_dbscan_general(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
+{
+	uint32_t p, i;
+	struct STRTree tree;
+	struct QueryContext cxt =
+	{
+		.items_found = NULL,
+		.num_items_found = 0,
+		.items_found_size = 0
+	};
+	int success = LW_SUCCESS;
+	uint32_t* neighbors;
 	char* in_a_cluster;
 	char* is_in_core;
 
-	if (num_geoms <= 1)
+	/* Bail if we don't even have enough inputs to make a cluster. */
+	if (num_geoms <= min_points)
 		return LW_SUCCESS;
 
-	tree = make_strtree((void**) geoms, num_geoms, 1);
+	tree = make_strtree((void**) geoms, num_geoms, LW_TRUE);
 	if (tree.tree == NULL)
 	{
-		destroy_strtree(tree);
+		destroy_strtree(&tree);
 		return LW_FAILURE;
 	}
 
@@ -286,22 +405,41 @@
 	memset(in_a_cluster, 0, num_geoms * sizeof(char));
 	is_in_core = lwalloc(num_geoms * sizeof(char));
 	memset(is_in_core, 0, num_geoms * sizeof(char));
+	neighbors = lwalloc(min_points * sizeof(uint32_t));
 
 	for (p = 0; p < num_geoms; p++)
 	{
 		uint32_t num_neighbors = 0;
-		uint32_t* neighbors;
 
 		if (lwgeom_is_empty(geoms[p]))
 			continue;
 
 		dbscan_update_context(tree.tree, &cxt, geoms, p, eps);
-		neighbors = lwalloc(cxt.num_items_found * sizeof(uint32_t*));
 
+		/* We didn't find enough points to do anything, even if they are all within eps. */
+		if (cxt.num_items_found < min_points)
+			continue;
+
 		for (i = 0; i < cxt.num_items_found; i++)
 		{
 			uint32_t q = *((uint32_t*) cxt.items_found[i]);
 
+			if (num_neighbors >= min_points)
+			{
+				/* If we've already identified p as a core point, and it's already
+				 * in the same cluster in q, then there's nothing to learn by
+				 * computing the distance.
+				 */
+				if (UF_find(uf, p) == UF_find(uf, q))
+					continue;
+
+				/* Similarly, if q is already identifed as a border point of another
+				 * cluster, there's no point figuring out what the distance is.
+				 */
+				if (in_a_cluster[q] && !is_in_core[q])
+					continue;
+			}
+
 			double mindist = lwgeom_mindistance2d_tolerance(geoms[p], geoms[q], eps);
 			if (mindist == FLT_MAX)
 			{
@@ -311,44 +449,43 @@
 
 			if (mindist <= eps)
 			{
-				neighbors[num_neighbors++] = q;
-			}
-		}
+				/* If we haven't hit min_points yet, we don't know if we can union p and q.
+				 * Just set q aside for now.
+				 */
+				if (num_neighbors < min_points)
+				{
+					neighbors[num_neighbors++] = q;
 
-		if (!success)
-			break;
-
-		if (num_neighbors >= min_points)
-		{
-			in_a_cluster[p] = LW_TRUE;
-			is_in_core[p] = LW_TRUE;
-
-			for (i = 0; i < num_neighbors; i++)
-			{
-				uint32_t q = neighbors[i];
-
-				if (in_a_cluster[q])
-				{
-					/* Can we merge p's cluster with q's cluster?  We can do this only
-					 * if both p and q are considered _core_ points of their respective
-					 * clusters.
+					/* If we just hit min_points, we can now union all of the neighbor geometries
+					 * we've been saving.
 					 */
-					 if (is_in_core[q])
-					 {
-						 UF_union(uf, p, q);
-					 }
+					if (num_neighbors == min_points)
+					{
+						uint32_t j;
+						is_in_core[p] = LW_TRUE;
+						in_a_cluster[p] = LW_TRUE;
+						for (j = 0; j < num_neighbors; j++)
+						{
+							union_if_available(uf, p, neighbors[j], is_in_core, in_a_cluster);
+						}
+					}
 				}
 				else
 				{
-					UF_union(uf, p, q);
-					in_a_cluster[q] = LW_TRUE;
+					/* If we're above min_points, no need to store our neighbors, just go ahead
+					 * and union them now.  This may allow us to cut out some distance
+					 * computations.
+					 */
+					union_if_available(uf, p, q, is_in_core, in_a_cluster);
 				}
 			}
 		}
 
-		lwfree(neighbors);
+		if (!success)
+			break;
 	}
 
+	lwfree(neighbors);
 	lwfree(is_in_core);
 
 	/* Either pass in_a_cluster to our caller, or free it. */
@@ -360,10 +497,18 @@
 	if (cxt.items_found)
 		lwfree(cxt.items_found);
 
-	destroy_strtree(tree);
+	destroy_strtree(&tree);
 	return success;
 }
 
+int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
+{
+	if (min_points <= 1)
+		return union_dbscan_minpoints_1(geoms, num_geoms, uf, eps, in_a_cluster_ret);
+	else
+		return union_dbscan_general(geoms, num_geoms, uf, eps, min_points, in_a_cluster_ret);
+}
+
 /** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
  *  GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
  *  responsible for freeing the input array, but not the LWGEOM* items inside it. */



More information about the postgis-tickets mailing list