[postgis-tickets] r14979 - Optimize and reduce memory usage in ST_ClusterDBSCAN / ST_ClusterWithin
Daniel Baston
dbaston at gmail.com
Fri Jun 24 18:30:12 PDT 2016
Author: dbaston
Date: 2016-06-24 18:30:12 -0700 (Fri, 24 Jun 2016)
New Revision: 14979
Modified:
trunk/liblwgeom/lwgeom_geos.c
trunk/liblwgeom/lwgeom_geos.h
trunk/liblwgeom/lwgeom_geos_cluster.c
Log:
Optimize and reduce memory usage in ST_ClusterDBSCAN / ST_ClusterWithin
Modified: trunk/liblwgeom/lwgeom_geos.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos.c 2016-06-24 06:34:41 UTC (rev 14978)
+++ trunk/liblwgeom/lwgeom_geos.c 2016-06-25 01:30:12 UTC (rev 14979)
@@ -482,6 +482,44 @@
return g;
}
+GEOSGeometry*
+make_geos_point(double x, double y)
+{
+ GEOSCoordSequence* seq = GEOSCoordSeq_create(1, 2);
+ GEOSGeometry* geom = NULL;
+
+ if (!seq)
+ return NULL;
+
+ GEOSCoordSeq_setX(seq, 0, x);
+ GEOSCoordSeq_setY(seq, 0, y);
+
+ geom = GEOSGeom_createPoint(seq);
+ if (!geom)
+ GEOSCoordSeq_destroy(seq);
+ return geom;
+}
+
+GEOSGeometry*
+make_geos_segment(double x1, double y1, double x2, double y2)
+{
+ GEOSCoordSequence* seq = GEOSCoordSeq_create(2, 2);
+ GEOSGeometry* geom = NULL;
+
+ if (!seq)
+ return NULL;
+
+ GEOSCoordSeq_setX(seq, 0, x1);
+ GEOSCoordSeq_setY(seq, 0, y1);
+ GEOSCoordSeq_setX(seq, 1, x2);
+ GEOSCoordSeq_setY(seq, 1, y2);
+
+ geom = GEOSGeom_createLineString(seq);
+ if (!geom)
+ GEOSCoordSeq_destroy(seq);
+ return geom;
+}
+
const char*
lwgeom_geos_version()
{
Modified: trunk/liblwgeom/lwgeom_geos.h
===================================================================
--- trunk/liblwgeom/lwgeom_geos.h 2016-06-24 06:34:41 UTC (rev 14978)
+++ trunk/liblwgeom/lwgeom_geos.h 2016-06-25 01:30:12 UTC (rev 14979)
@@ -41,6 +41,9 @@
GEOSGeometry * GBOX2GEOS(const GBOX *g);
GEOSGeometry * LWGEOM_GEOS_buildArea(const GEOSGeometry* geom_in);
+GEOSGeometry * make_geos_point(double x, double y);
+GEOSGeometry * make_geos_segment(double x1, double y1, double x2, double y2);
+
int cluster_intersecting(GEOSGeometry** geoms, uint32_t num_geoms, GEOSGeometry*** clusterGeoms, uint32_t* num_clusters);
int cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** is_in_cluster_ret);
Modified: trunk/liblwgeom/lwgeom_geos_cluster.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos_cluster.c 2016-06-24 06:34:41 UTC (rev 14978)
+++ trunk/liblwgeom/lwgeom_geos_cluster.c 2016-06-25 01:30:12 UTC (rev 14979)
@@ -18,7 +18,7 @@
*
**********************************************************************
*
- * Copyright 2015 Daniel Baston <dbaston at gmail.com>
+ * Copyright 2015-2016 Daniel Baston <dbaston at gmail.com>
*
**********************************************************************/
@@ -31,7 +31,7 @@
static const int STRTREE_NODE_CAPACITY = 10;
-/* Utility struct used to accumulate geometries in GEOSSTRtree_query callback */
+/* Utility struct used to accumulate items in GEOSSTRtree_query callback */
struct QueryContext
{
void** items_found;
@@ -49,11 +49,34 @@
};
static struct STRTree make_strtree(void** geoms, uint32_t num_geoms, char is_lwgeom);
-static void destroy_strtree(struct STRTree tree);
+static void destroy_strtree(struct STRTree * tree);
static int union_intersecting_pairs(GEOSGeometry** geoms, uint32_t num_geoms, UNIONFIND* uf);
static int combine_geometries(UNIONFIND* uf, void** geoms, uint32_t num_geoms, void*** clustersGeoms, uint32_t* num_clusters, char is_lwgeom);
-/** Make a GEOSSTRtree of either GEOSGeometry* or LWGEOM* pointers */
+/* Make a minimal GEOSGeometry* whose Envelope covers the same 2D extent as
+ * the supplied GBOX. This is faster and uses less memory than building a
+ * five-point polygon with GBOX2GEOS.
+ */
+static GEOSGeometry*
+geos_envelope_surrogate(const LWGEOM* g)
+{
+ if (lwgeom_is_empty(g))
+ return GEOSGeom_createEmptyPolygon();
+
+ if (lwgeom_get_type(g) == POINTTYPE) {
+ const POINT2D* pt = getPoint2d_cp(lwgeom_as_lwpoint(g)->point, 0);
+ return make_geos_point(pt->x, pt->y);
+ } else {
+ const GBOX* box = lwgeom_get_bbox(g);
+ if (!box)
+ return NULL;
+
+ return make_geos_segment(box->xmin, box->ymin, box->xmax, box->ymax);
+ }
+}
+
+/** Make a GEOSSTRtree that stores a pointer to a variable containing
+ * the array index of the input geoms */
static struct STRTree
make_strtree(void** geoms, uint32_t num_geoms, char is_lwgeom)
{
@@ -63,49 +86,50 @@
{
return tree;
}
- tree.envelopes = lwalloc(num_geoms * sizeof(GEOSGeometry*));
tree.geom_ids = lwalloc(num_geoms * sizeof(uint32_t));
tree.num_geoms = num_geoms;
- size_t i;
- for (i = 0; i < num_geoms; i++)
+ if (is_lwgeom)
{
- tree.geom_ids[i] = i;
- if (!is_lwgeom)
+ uint32_t i;
+ tree.envelopes = lwalloc(num_geoms * sizeof(GEOSGeometry*));
+ for (i = 0; i < num_geoms; i++)
{
- tree.envelopes[i] = GEOSEnvelope(geoms[i]);
+ tree.geom_ids[i] = i;
+ tree.envelopes[i] = geos_envelope_surrogate(geoms[i]);
+ GEOSSTRtree_insert(tree.tree, tree.envelopes[i], &(tree.geom_ids[i]));
}
- else
+ }
+ else
+ {
+ uint32_t i;
+ tree.envelopes = NULL;
+ for (i = 0; i < num_geoms; i++)
{
- const GBOX* box = lwgeom_get_bbox(geoms[i]);
- if (box)
- {
- tree.envelopes[i] = GBOX2GEOS(box);
- }
- else
- {
- /* Empty geometry */
- tree.envelopes[i] = GEOSGeom_createEmptyPolygon();
- }
+ tree.geom_ids[i] = i;
+ GEOSSTRtree_insert(tree.tree, geoms[i], &(tree.geom_ids[i]));
}
- GEOSSTRtree_insert(tree.tree, tree.envelopes[i], &(tree.geom_ids[i]));
}
+
return tree;
}
/** Clean up STRTree after use */
static void
-destroy_strtree(struct STRTree tree)
+destroy_strtree(struct STRTree * tree)
{
size_t i;
- GEOSSTRtree_destroy(tree.tree);
+ GEOSSTRtree_destroy(tree->tree);
- for (i = 0; i < tree.num_geoms; i++)
+ if (tree->envelopes)
{
- GEOSGeom_destroy(tree.envelopes[i]);
+ for (i = 0; i < tree->num_geoms; i++)
+ {
+ GEOSGeom_destroy(tree->envelopes[i]);
+ }
+ lwfree(tree->envelopes);
}
- lwfree(tree.geom_ids);
- lwfree(tree.envelopes);
+ lwfree(tree->geom_ids);
}
static void
@@ -143,24 +167,22 @@
if (num_geoms <= 1)
return LW_SUCCESS;
- tree = make_strtree((void**) geoms, num_geoms, 0);
+ tree = make_strtree((void**) geoms, num_geoms, LW_FALSE);
if (tree.tree == NULL)
{
- destroy_strtree(tree);
+ destroy_strtree(&tree);
return LW_FAILURE;
}
for (p = 0; p < num_geoms; p++)
{
const GEOSPreparedGeometry* prep = NULL;
- GEOSGeometry* query_envelope;
if (GEOSisEmpty(geoms[p]))
continue;
cxt.num_items_found = 0;
- query_envelope = GEOSEnvelope(geoms[p]);
- GEOSSTRtree_query(tree.tree, query_envelope, &query_accumulate, &cxt);
+ GEOSSTRtree_query(tree.tree, geoms[p], &query_accumulate, &cxt);
for (i = 0; i < cxt.num_items_found; i++)
{
@@ -202,7 +224,6 @@
if (prep)
GEOSPreparedGeom_destroy(prep);
- GEOSGeom_destroy(query_envelope);
if (!success)
break;
@@ -211,7 +232,7 @@
if (cxt.items_found)
lwfree(cxt.items_found);
- destroy_strtree(tree);
+ destroy_strtree(&tree);
return success;
}
@@ -235,31 +256,63 @@
return cluster_success;
}
-
static int
dbscan_update_context(GEOSSTRtree* tree, struct QueryContext* cxt, LWGEOM** geoms, uint32_t p, double eps)
{
cxt->num_items_found = 0;
- const GBOX* geom_extent = lwgeom_get_bbox(geoms[p]);
- GBOX* query_extent = gbox_clone(geom_extent);
- gbox_expand(query_extent, eps);
- GEOSGeometry* query_envelope = GBOX2GEOS(query_extent);
+ GEOSGeometry* query_envelope;
+ if (geoms[p]->type == POINTTYPE)
+ {
+ const POINT2D* pt = getPoint2d_cp(lwgeom_as_lwpoint(geoms[p])->point, 0);
+ query_envelope = make_geos_segment( pt->x - eps, pt->y - eps, pt->x + eps, pt->y + eps );
+ } else {
+ const GBOX* box = lwgeom_get_bbox(geoms[p]);
+ query_envelope = make_geos_segment( box->xmin - eps, box->ymin - eps, box->xmax + eps, box->ymax + eps );
+ }
if (!query_envelope)
return LW_FAILURE;
GEOSSTRtree_query(tree, query_envelope, &query_accumulate, cxt);
- lwfree(query_extent);
GEOSGeom_destroy(query_envelope);
return LW_SUCCESS;
}
-int
-union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
+/* Union p's cluster with q's cluster, if q is not a border point of another cluster.
+ * Applicable to DBSCAN with minpoints > 1.
+ */
+static void
+union_if_available(UNIONFIND* uf, uint32_t p, uint32_t q, char* is_in_core, char* in_a_cluster)
{
+ if (in_a_cluster[q])
+ {
+ /* Can we merge p's cluster with q's cluster? We can do this only
+ * if both p and q are considered _core_ points of their respective
+ * clusters.
+ */
+ if (is_in_core[q])
+ {
+ UF_union(uf, p, q);
+ }
+ }
+ else
+ {
+ UF_union(uf, p, q);
+ in_a_cluster[q] = LW_TRUE;
+ }
+}
+
+/* An optimized DBSCAN union for the case where min_points == 1.
+ * If min_points == 1, then we don't care how many neighbors we find; we can union clusters
+ * on the fly, as as we go through the distance calculations. This potentially allows us
+ * to avoid some distance computations altogether.
+ */
+static int
+union_dbscan_minpoints_1(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, char** in_a_cluster_ret)
+{
uint32_t p, i;
struct STRTree tree;
struct QueryContext cxt =
@@ -269,16 +322,82 @@
.items_found_size = 0
};
int success = LW_SUCCESS;
+
+ if (num_geoms <= 1)
+ return LW_SUCCESS;
+
+ tree = make_strtree((void**) geoms, num_geoms, LW_TRUE);
+ if (tree.tree == NULL)
+ {
+ destroy_strtree(&tree);
+ return LW_FAILURE;
+ }
+
+ for (p = 0; p < num_geoms; p++)
+ {
+ if (lwgeom_is_empty(geoms[p]))
+ continue;
+
+ dbscan_update_context(tree.tree, &cxt, geoms, p, eps);
+ for (i = 0; i < cxt.num_items_found; i++)
+ {
+ uint32_t q = *((uint32_t*) cxt.items_found[i]);
+
+ if (UF_find(uf, p) != UF_find(uf, q))
+ {
+ double mindist = lwgeom_mindistance2d_tolerance(geoms[p], geoms[q], eps);
+ if (mindist == FLT_MAX)
+ {
+ success = LW_FAILURE;
+ break;
+ }
+
+ if (mindist <= eps)
+ UF_union(uf, p, q);
+ }
+ }
+ }
+
+ if (in_a_cluster_ret)
+ {
+ char* in_a_cluster = lwalloc(num_geoms * sizeof(char));
+ for (i = 0; i < num_geoms; i++)
+ in_a_cluster[i] = LW_TRUE;
+ *in_a_cluster_ret = in_a_cluster;
+ }
+
+ if (cxt.items_found)
+ lwfree(cxt.items_found);
+
+ destroy_strtree(&tree);
+
+ return success;
+}
+
+static int
+union_dbscan_general(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
+{
+ uint32_t p, i;
+ struct STRTree tree;
+ struct QueryContext cxt =
+ {
+ .items_found = NULL,
+ .num_items_found = 0,
+ .items_found_size = 0
+ };
+ int success = LW_SUCCESS;
+ uint32_t* neighbors;
char* in_a_cluster;
char* is_in_core;
- if (num_geoms <= 1)
+ /* Bail if we don't even have enough inputs to make a cluster. */
+ if (num_geoms <= min_points)
return LW_SUCCESS;
- tree = make_strtree((void**) geoms, num_geoms, 1);
+ tree = make_strtree((void**) geoms, num_geoms, LW_TRUE);
if (tree.tree == NULL)
{
- destroy_strtree(tree);
+ destroy_strtree(&tree);
return LW_FAILURE;
}
@@ -286,22 +405,41 @@
memset(in_a_cluster, 0, num_geoms * sizeof(char));
is_in_core = lwalloc(num_geoms * sizeof(char));
memset(is_in_core, 0, num_geoms * sizeof(char));
+ neighbors = lwalloc(min_points * sizeof(uint32_t));
for (p = 0; p < num_geoms; p++)
{
uint32_t num_neighbors = 0;
- uint32_t* neighbors;
if (lwgeom_is_empty(geoms[p]))
continue;
dbscan_update_context(tree.tree, &cxt, geoms, p, eps);
- neighbors = lwalloc(cxt.num_items_found * sizeof(uint32_t*));
+ /* We didn't find enough points to do anything, even if they are all within eps. */
+ if (cxt.num_items_found < min_points)
+ continue;
+
for (i = 0; i < cxt.num_items_found; i++)
{
uint32_t q = *((uint32_t*) cxt.items_found[i]);
+ if (num_neighbors >= min_points)
+ {
+ /* If we've already identified p as a core point, and it's already
+ * in the same cluster in q, then there's nothing to learn by
+ * computing the distance.
+ */
+ if (UF_find(uf, p) == UF_find(uf, q))
+ continue;
+
+ /* Similarly, if q is already identifed as a border point of another
+ * cluster, there's no point figuring out what the distance is.
+ */
+ if (in_a_cluster[q] && !is_in_core[q])
+ continue;
+ }
+
double mindist = lwgeom_mindistance2d_tolerance(geoms[p], geoms[q], eps);
if (mindist == FLT_MAX)
{
@@ -311,44 +449,43 @@
if (mindist <= eps)
{
- neighbors[num_neighbors++] = q;
- }
- }
+ /* If we haven't hit min_points yet, we don't know if we can union p and q.
+ * Just set q aside for now.
+ */
+ if (num_neighbors < min_points)
+ {
+ neighbors[num_neighbors++] = q;
- if (!success)
- break;
-
- if (num_neighbors >= min_points)
- {
- in_a_cluster[p] = LW_TRUE;
- is_in_core[p] = LW_TRUE;
-
- for (i = 0; i < num_neighbors; i++)
- {
- uint32_t q = neighbors[i];
-
- if (in_a_cluster[q])
- {
- /* Can we merge p's cluster with q's cluster? We can do this only
- * if both p and q are considered _core_ points of their respective
- * clusters.
+ /* If we just hit min_points, we can now union all of the neighbor geometries
+ * we've been saving.
*/
- if (is_in_core[q])
- {
- UF_union(uf, p, q);
- }
+ if (num_neighbors == min_points)
+ {
+ uint32_t j;
+ is_in_core[p] = LW_TRUE;
+ in_a_cluster[p] = LW_TRUE;
+ for (j = 0; j < num_neighbors; j++)
+ {
+ union_if_available(uf, p, neighbors[j], is_in_core, in_a_cluster);
+ }
+ }
}
else
{
- UF_union(uf, p, q);
- in_a_cluster[q] = LW_TRUE;
+ /* If we're above min_points, no need to store our neighbors, just go ahead
+ * and union them now. This may allow us to cut out some distance
+ * computations.
+ */
+ union_if_available(uf, p, q, is_in_core, in_a_cluster);
}
}
}
- lwfree(neighbors);
+ if (!success)
+ break;
}
+ lwfree(neighbors);
lwfree(is_in_core);
/* Either pass in_a_cluster to our caller, or free it. */
@@ -360,10 +497,18 @@
if (cxt.items_found)
lwfree(cxt.items_found);
- destroy_strtree(tree);
+ destroy_strtree(&tree);
return success;
}
+int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
+{
+ if (min_points <= 1)
+ return union_dbscan_minpoints_1(geoms, num_geoms, uf, eps, in_a_cluster_ret);
+ else
+ return union_dbscan_general(geoms, num_geoms, uf, eps, min_points, in_a_cluster_ret);
+}
+
/** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
* GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
* responsible for freeing the input array, but not the LWGEOM* items inside it. */
More information about the postgis-tickets
mailing list