[postgis-tickets] r14934 - #3572, ST_ClusterDBSCAN should not join clusters by their borders
Daniel Baston
dbaston at gmail.com
Mon Jun 6 05:24:37 PDT 2016
Author: dbaston
Date: 2016-06-06 05:24:32 -0700 (Mon, 06 Jun 2016)
New Revision: 14934
Modified:
trunk/doc/reference_measure.xml
trunk/liblwgeom/cunit/cu_geos_cluster.c
trunk/liblwgeom/cunit/cu_tester.h
trunk/liblwgeom/cunit/cu_unionfind.c
trunk/liblwgeom/lwgeom_geos.h
trunk/liblwgeom/lwgeom_geos_cluster.c
trunk/liblwgeom/lwunionfind.c
trunk/liblwgeom/lwunionfind.h
trunk/postgis/lwgeom_geos.c
trunk/postgis/lwgeom_window.c
trunk/regress/cluster_expected
Log:
#3572, ST_ClusterDBSCAN should not join clusters by their borders
Modified: trunk/doc/reference_measure.xml
===================================================================
--- trunk/doc/reference_measure.xml 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/doc/reference_measure.xml 2016-06-06 12:24:32 UTC (rev 14934)
@@ -1077,15 +1077,37 @@
<refsection>
<title>Description</title>
- <para>Returns cluster number for each input geometry, based on a 2D
- implementation of the
+ <para>
+ Returns cluster number for each input geometry, based on a 2D implementation of the
<ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink>
- algorithm. An input geometry will be added to a cluster if it is
- within <varname>eps</varname> distance of at least
- <varname>minpoints</varname> other input geometries. Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of
- clusters to be specified, but instead uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters
- to construct each cluster. Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
- </para>
+ algorithm. Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of clusters to be specified, but instead
+ uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters to construct each cluster.
+ </para>
+
+ <para>
+ An input geometry will be added to a cluster if it is either:
+ <itemizedlist>
+ <listitem>
+ A "core" geometry, that is within <varname>eps</varname> distance of at least <varname>minpoints</varname> other input geometries, or
+ </listitem>
+ <listitem>
+ A "border" geometry, that is within <varname>eps</varname> distance of a core geometry.
+ </listitem>
+ </itemizedlist>
+ </para>
+
+ <para>
+ Note that border geometries may be within <varname>eps</varname> distance of core geometries in more than one cluster; in this
+ case, either assignment would be correct, and the border geometry will be arbitrarily asssigned to one of the available clusters.
+ In these cases, it is possible for a correct cluster to be generated with fewer than <varname>minpoints</varname> geometries.
+ When assignment of a border geometry is ambiguous, repeated calls to ST_ClusterDBSCAN will produce identical results if an ORDER BY
+ clause is included in the window definition, but cluster assignments may differ from other implementations of the same algorithm.
+ </para>
+
+ <para>
+ Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
+ </para>
+
<para>Availability: 2.3.0 - requires GEOS </para>
</refsection>
Modified: trunk/liblwgeom/cunit/cu_geos_cluster.c
===================================================================
--- trunk/liblwgeom/cunit/cu_geos_cluster.c 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/cunit/cu_geos_cluster.c 2016-06-06 12:24:32 UTC (rev 14934)
@@ -255,6 +255,41 @@
perform_cluster_intersecting_test(wkt_inputs_pt, 2, expected_outputs_pt, 1);
}
+static void dbscan_test(void)
+{
+ char* wkt_inputs[] = { "POINT (0 0)", "POINT (-1 0)", "POINT (-1 -0.1)", "POINT (-1 0.1)",
+ "POINT (1 0)",
+ "POINT (2 0)", "POINT (3 0)", "POINT ( 3 -0.1)", "POINT ( 3 0.1)" };
+ uint32_t num_geoms = sizeof(wkt_inputs) / sizeof(char*);
+ LWGEOM** geoms = WKTARRAY2LWGEOM(wkt_inputs, num_geoms);
+ UNIONFIND* uf = UF_create(num_geoms);
+ uint32_t* ids;
+ char* in_a_cluster;
+ uint32_t i;
+
+ /* Although POINT (1 0) and POINT (2 0) are within eps distance of each other,
+ * they do not connect the two clusters because POINT (1 0) is not a core point.
+ * See #3572
+ */
+ double eps = 1.01;
+ uint32_t min_points = 5;
+ uint32_t expected_ids[] = { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 };
+
+ union_dbscan(geoms, num_geoms, uf, eps, min_points, &in_a_cluster);
+ ids = UF_get_collapsed_cluster_ids(uf, in_a_cluster);
+
+ ASSERT_INTARRAY_EQUAL(ids, expected_ids, num_geoms);
+
+ UF_destroy(uf);
+ for (i = 0; i < num_geoms; i++)
+ {
+ lwgeom_free(geoms[i]);
+ }
+ lwfree(geoms);
+ lwfree(in_a_cluster);
+ lwfree(ids);
+}
+
void geos_cluster_suite_setup(void);
void geos_cluster_suite_setup(void)
{
@@ -265,4 +300,5 @@
PG_ADD_TEST(suite, single_input_test);
PG_ADD_TEST(suite, empty_inputs_test);
PG_ADD_TEST(suite, multipoint_test);
+ PG_ADD_TEST(suite, dbscan_test);
}
Modified: trunk/liblwgeom/cunit/cu_tester.h
===================================================================
--- trunk/liblwgeom/cunit/cu_tester.h 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/cunit/cu_tester.h 2016-06-06 12:24:32 UTC (rev 14934)
@@ -54,5 +54,24 @@
CU_ASSERT_TRUE(lwgeom_same(o, e)); \
} while(0);
+#define ASSERT_INTARRAY_EQUAL(o, e, n) do { \
+ size_t i = 0; \
+ for (i = 0; i < n; i++) { \
+ if (o[i] != e[i]) { \
+ fprintf(stderr, "[%s:%d]", __FILE__, __LINE__); \
+ fprintf(stderr, "\nExpected: ["); \
+ for (i = 0; i < n; i++) \
+ fprintf(stderr, " %d", e[i]); \
+ fprintf(stderr, " ]\nObtained: ["); \
+ for (i = 0; i < n; i++) \
+ fprintf(stderr, " %d", o[i]); \
+ fprintf(stderr, " ]\n"); \
+ CU_FAIL(); \
+ break; \
+ } \
+ } \
+ CU_PASS(); \
+} while(0);
+
/* Utility functions */
void do_fn_test(LWGEOM* (*transfn)(LWGEOM*), char *input_wkt, char *expected_wkt);
Modified: trunk/liblwgeom/cunit/cu_unionfind.c
===================================================================
--- trunk/liblwgeom/cunit/cu_unionfind.c 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/cunit/cu_unionfind.c 2016-06-06 12:24:32 UTC (rev 14934)
@@ -22,10 +22,10 @@
uint32_t expected_initial_ids[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint32_t expected_initial_sizes[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
- CU_ASSERT_EQUAL(10, uf->N);
- CU_ASSERT_EQUAL(10, uf->num_clusters);
- CU_ASSERT_EQUAL(0, memcmp(uf->clusters, expected_initial_ids, 10*sizeof(uint32_t)));
- CU_ASSERT_EQUAL(0, memcmp(uf->cluster_sizes, expected_initial_sizes, 10*sizeof(uint32_t)));
+ ASSERT_INT_EQUAL(uf->N, 10);
+ ASSERT_INT_EQUAL(uf->num_clusters, 10);
+ ASSERT_INTARRAY_EQUAL(uf->clusters, expected_initial_ids, 10);
+ ASSERT_INTARRAY_EQUAL(uf->cluster_sizes, expected_initial_sizes, 10);
UF_destroy(uf);
}
@@ -42,10 +42,10 @@
uint32_t expected_final_ids[] = { 0, 2, 2, 2, 4, 5, 6, 0, 0, 9 };
uint32_t expected_final_sizes[] = { 3, 0, 3, 0, 1, 1, 1, 0, 0, 1 };
- CU_ASSERT_EQUAL(10, uf->N);
- CU_ASSERT_EQUAL(6, uf->num_clusters);
- CU_ASSERT_EQUAL(0, memcmp(uf->clusters, expected_final_ids, 10*sizeof(uint32_t)));
- CU_ASSERT_EQUAL(0, memcmp(uf->cluster_sizes, expected_final_sizes, 10*sizeof(uint32_t)));
+ ASSERT_INT_EQUAL(uf->N, 10);
+ ASSERT_INT_EQUAL(uf->num_clusters, 6);
+ ASSERT_INTARRAY_EQUAL(uf->clusters, expected_final_ids, 10);
+ ASSERT_INTARRAY_EQUAL(uf->cluster_sizes, expected_final_sizes, 10);
UF_destroy(uf);
}
@@ -140,17 +140,23 @@
* 7 -> 1
* 8 -> 2
*/
- uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 };
- uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0);
+ uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
+ uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, NULL);
- CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+ ASSERT_INTARRAY_EQUAL(collapsed_ids, expected_collapsed_ids, 10);
lwfree(collapsed_ids);
- uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 };
- collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999);
+ char is_in_cluster[] = { 0, 1, 1, 1, 0, 1, 0, 0, 0, 0 };
+ uint32_t expected_collapsed_ids2[] = { 8, 0, 0, 0, 7, 0, 8, 7, 8, 7 };
- CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t)));
+ collapsed_ids = UF_get_collapsed_cluster_ids(uf, is_in_cluster);
+ int i;
+ for (i = 0; i < uf->N; i++)
+ {
+ if (is_in_cluster[i])
+ ASSERT_INT_EQUAL(expected_collapsed_ids2[i], collapsed_ids[i]);
+ }
lwfree(collapsed_ids);
UF_destroy(uf);
Modified: trunk/liblwgeom/lwgeom_geos.h
===================================================================
--- trunk/liblwgeom/lwgeom_geos.h 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwgeom_geos.h 2016-06-06 12:24:32 UTC (rev 14934)
@@ -43,8 +43,7 @@
int cluster_intersecting(GEOSGeometry** geoms, uint32_t num_geoms, GEOSGeometry*** clusterGeoms, uint32_t* num_clusters);
int cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
-int cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
-int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points);
+int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** is_in_cluster_ret);
POINTARRAY *ptarray_from_GEOSCoordSeq(const GEOSCoordSequence *cs, char want3d);
Modified: trunk/liblwgeom/lwgeom_geos_cluster.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos_cluster.c 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwgeom_geos_cluster.c 2016-06-06 12:24:32 UTC (rev 14934)
@@ -235,17 +235,7 @@
return cluster_success;
}
-/** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
- * GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
- * responsible for freeing the input array, but not the LWGEOM* items inside it. */
-int
-cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
-{
- int min_points = 1;
- return cluster_dbscan(geoms, num_geoms, tolerance, min_points, clusterGeoms, num_clusters);
-}
-
static int
dbscan_update_context(GEOSSTRtree* tree, struct QueryContext* cxt, LWGEOM** geoms, uint32_t p, double eps)
{
@@ -268,7 +258,7 @@
}
int
-union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points)
+union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
{
uint32_t p, i;
struct STRTree tree;
@@ -279,6 +269,8 @@
.items_found_size = 0
};
int success = LW_SUCCESS;
+ char* in_a_cluster;
+ char* is_in_core;
if (num_geoms <= 1)
return LW_SUCCESS;
@@ -290,6 +282,11 @@
return LW_FAILURE;
}
+ in_a_cluster = lwalloc(num_geoms * sizeof(char));
+ memset(in_a_cluster, 0, num_geoms * sizeof(char));
+ is_in_core = lwalloc(num_geoms * sizeof(char));
+ memset(is_in_core, 0, num_geoms * sizeof(char));
+
for (p = 0; p < num_geoms; p++)
{
uint32_t num_neighbors = 0;
@@ -313,23 +310,53 @@
}
if (mindist <= eps)
+ {
neighbors[num_neighbors++] = q;
+ }
}
+ if (!success)
+ break;
+
if (num_neighbors >= min_points)
{
+ in_a_cluster[p] = LW_TRUE;
+ is_in_core[p] = LW_TRUE;
+
for (i = 0; i < num_neighbors; i++)
{
- UF_union(uf, p, neighbors[i]);
+ uint32_t q = neighbors[i];
+
+ if (in_a_cluster[q])
+ {
+ /* Can we merge p's cluster with q's cluster? We can do this only
+ * if both p and q are considered _core_ points of their respective
+ * clusters.
+ */
+ if (is_in_core[q])
+ {
+ UF_union(uf, p, q);
+ }
+ }
+ else
+ {
+ UF_union(uf, p, q);
+ in_a_cluster[q] = LW_TRUE;
+ }
}
}
lwfree(neighbors);
-
- if (!success)
- break;
}
+ lwfree(is_in_core);
+
+ /* Either pass in_a_cluster to our caller, or free it. */
+ if (in_a_cluster_ret)
+ *in_a_cluster_ret = in_a_cluster;
+ else
+ lwfree(in_a_cluster);
+
if (cxt.items_found)
lwfree(cxt.items_found);
@@ -337,13 +364,16 @@
return success;
}
+/** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
+ * GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
+ * responsible for freeing the input array, but not the LWGEOM* items inside it. */
int
-cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
+cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
{
int cluster_success;
UNIONFIND* uf = UF_create(num_geoms);
- if (union_dbscan(geoms, num_geoms, uf, eps, min_points) == LW_FAILURE)
+ if (union_dbscan(geoms, num_geoms, uf, tolerance, 1, NULL) == LW_FAILURE)
{
UF_destroy(uf);
return LW_FAILURE;
Modified: trunk/liblwgeom/lwunionfind.c
===================================================================
--- trunk/liblwgeom/lwunionfind.c 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwunionfind.c 2016-06-06 12:24:32 UTC (rev 14934)
@@ -142,25 +142,25 @@
}
uint32_t*
-UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id)
+UF_get_collapsed_cluster_ids(UNIONFIND* uf, const char* is_in_cluster)
{
uint32_t* ordered_components = UF_ordered_by_cluster(uf);
uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
uint32_t last_old_id, current_new_id, i;
+ char encountered_cluster = LW_FALSE;
- last_old_id = UF_find(uf, ordered_components[0]);
- current_new_id = 1;
+ current_new_id = 0;
for (i = 0; i < uf->N; i++)
{
uint32_t j = ordered_components[i];
-
- if (UF_size(uf, j) < min_cluster_size)
+ if (!is_in_cluster || is_in_cluster[j])
{
- new_ids[j] = noise_cluster_id;
- }
- else
- {
uint32_t current_old_id = UF_find(uf, j);
+ if (!encountered_cluster)
+ {
+ encountered_cluster = LW_TRUE;
+ last_old_id = current_old_id;
+ }
if (current_old_id != last_old_id)
current_new_id++;
Modified: trunk/liblwgeom/lwunionfind.h
===================================================================
--- trunk/liblwgeom/lwunionfind.h 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwunionfind.h 2016-06-06 12:24:32 UTC (rev 14934)
@@ -56,9 +56,9 @@
uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
/* Replace the cluster ids in a UNIONFIND with sequential ids starting at one.
- * If min_cluster_size is greater than 1, any clusters with fewer than
- * min_cluster_size items will be assigned to noise_cluster_id.
+ * If is_in_cluster array is provided, it will be used to skip any indexes
+ * that are not in a cluster.
* */
-uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id);
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, const char* is_in_cluster);
#endif
Modified: trunk/postgis/lwgeom_geos.c
===================================================================
--- trunk/postgis/lwgeom_geos.c 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/postgis/lwgeom_geos.c 2016-06-06 12:24:32 UTC (rev 14934)
@@ -3191,7 +3191,6 @@
LWGEOM** lw_inputs;
LWGEOM** lw_results;
double tolerance;
- int min_points;
int srid=SRID_UNKNOWN;
/* Parameters used to construct a result array */
@@ -3212,17 +3211,6 @@
PG_RETURN_NULL();
}
- if (PG_NARGS() < 3)
- min_points = 1;
- else
- min_points = PG_GETARG_INT32(2);
-
- if (min_points < 1)
- {
- lwpgerror("min_points must be a positive number.");
- PG_RETURN_NULL();
- }
-
nelems = array_nelems_not_null(array);
POSTGIS_DEBUGF(3, "cluster_within_distance_garray: number of non-null elements: %d", nelems);
@@ -3240,7 +3228,7 @@
PG_RETURN_NULL();
}
- if (cluster_dbscan(lw_inputs, nelems, tolerance, min_points, &lw_results, &nclusters) != LW_SUCCESS)
+ if (cluster_within_distance(lw_inputs, nelems, tolerance, &lw_results, &nclusters) != LW_SUCCESS)
{
elog(ERROR, "cluster_within: Error performing clustering");
PG_RETURN_NULL();
Modified: trunk/postgis/lwgeom_window.c
===================================================================
--- trunk/postgis/lwgeom_window.c 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/postgis/lwgeom_window.c 2016-06-06 12:24:32 UTC (rev 14934)
@@ -92,6 +92,7 @@
uint32_t i;
uint32_t* result_ids;
LWGEOM** geoms;
+ char* is_in_cluster;
UNIONFIND* uf;
bool tolerance_is_null;
bool minpoints_is_null;
@@ -127,7 +128,7 @@
}
}
- if (union_dbscan(geoms, ngeoms, uf, tolerance, minpoints) == LW_SUCCESS)
+ if (union_dbscan(geoms, ngeoms, uf, tolerance, minpoints, &is_in_cluster) == LW_SUCCESS)
context->is_error = LW_FALSE;
for (i = 0; i < ngeoms; i++)
@@ -139,26 +140,21 @@
if (context->is_error)
{
UF_destroy(uf);
+ lwfree(is_in_cluster);
lwpgerror("Error during clustering");
PG_RETURN_NULL();
}
- result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0);
+ result_ids = UF_get_collapsed_cluster_ids(uf, is_in_cluster);
for (i = 0; i < ngeoms; i++)
{
- if (result_ids[i] == 0)
+ if (!is_in_cluster[i])
{
context->cluster_assignments[i].is_null = LW_TRUE;
}
else
{
- /* We overloaded the zero cluster ID above to tag "noise" geometries
- * that are not part of any cluster. Now that those have been
- * properly converted into NULLs, we need to subtract one from
- * the collapsed cluster IDs so that we get a zero-based sequence,
- * consistent with our good friend ST_ClusterKMeans.
- */
- context->cluster_assignments[i].cluster_id = result_ids[i] - 1;
+ context->cluster_assignments[i].cluster_id = result_ids[i];
}
}
Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected 2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/regress/cluster_expected 2016-06-06 12:24:32 UTC (rev 14934)
@@ -24,6 +24,6 @@
t103|1|
t103|2|
t103|3|
-t103|4|1
-t103|5|1
-t103|6|1
+t103|4|0
+t103|5|0
+t103|6|0
More information about the postgis-tickets
mailing list