[postgis-tickets] r14934 - #3572, ST_ClusterDBSCAN should not join clusters by their borders

Daniel Baston dbaston at gmail.com
Mon Jun 6 05:24:37 PDT 2016


Author: dbaston
Date: 2016-06-06 05:24:32 -0700 (Mon, 06 Jun 2016)
New Revision: 14934

Modified:
   trunk/doc/reference_measure.xml
   trunk/liblwgeom/cunit/cu_geos_cluster.c
   trunk/liblwgeom/cunit/cu_tester.h
   trunk/liblwgeom/cunit/cu_unionfind.c
   trunk/liblwgeom/lwgeom_geos.h
   trunk/liblwgeom/lwgeom_geos_cluster.c
   trunk/liblwgeom/lwunionfind.c
   trunk/liblwgeom/lwunionfind.h
   trunk/postgis/lwgeom_geos.c
   trunk/postgis/lwgeom_window.c
   trunk/regress/cluster_expected
Log:
#3572, ST_ClusterDBSCAN should not join clusters by their borders

Modified: trunk/doc/reference_measure.xml
===================================================================
--- trunk/doc/reference_measure.xml	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/doc/reference_measure.xml	2016-06-06 12:24:32 UTC (rev 14934)
@@ -1077,15 +1077,37 @@
 	  <refsection>
       <title>Description</title>
 
-      <para>Returns cluster number for each input geometry, based on a 2D 
-          implementation of the 
+	  <para>
+		  Returns cluster number for each input geometry, based on a 2D implementation of the 
           <ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink> 
-          algorithm.  An input geometry will be added to a cluster if it is 
-          within <varname>eps</varname> distance of at least
-          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of
-          clusters to be specified, but instead uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters 
-          to construct each cluster.  Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
-      </para>
+		  algorithm.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of clusters to be specified, but instead 
+		  uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters to construct each cluster.
+	  </para>
+		  
+	  <para>
+		  An input geometry will be added to a cluster if it is either:
+		  <itemizedlist>
+			  <listitem>
+				  A "core" geometry, that is within <varname>eps</varname> distance of at least <varname>minpoints</varname> other input geometries, or
+			  </listitem>
+			  <listitem>
+				  A "border" geometry, that is within <varname>eps</varname> distance of a core geometry.
+			  </listitem>
+		  </itemizedlist>
+		</para>
+
+		<para>
+		  Note that border geometries may be within <varname>eps</varname> distance of core geometries in more than one cluster; in this
+		  case, either assignment would be correct, and the border geometry will be arbitrarily asssigned to one of the available clusters.
+		  In these cases, it is possible for a correct cluster to be generated with fewer than <varname>minpoints</varname> geometries.
+		  When assignment of a border geometry is ambiguous, repeated calls to ST_ClusterDBSCAN will produce identical results if an ORDER BY 
+		  clause is included in the window definition, but cluster assignments may differ from other implementations of the same algorithm.
+	  </para>
+
+	  <para>
+		  Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
+	  </para>
+
       <para>Availability: 2.3.0 - requires GEOS </para>
     </refsection>
 

Modified: trunk/liblwgeom/cunit/cu_geos_cluster.c
===================================================================
--- trunk/liblwgeom/cunit/cu_geos_cluster.c	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/cunit/cu_geos_cluster.c	2016-06-06 12:24:32 UTC (rev 14934)
@@ -255,6 +255,41 @@
 	perform_cluster_intersecting_test(wkt_inputs_pt, 2, expected_outputs_pt, 1);
 }
 
+static void dbscan_test(void)
+{
+	char* wkt_inputs[] = { "POINT (0 0)", "POINT (-1 0)", "POINT (-1 -0.1)", "POINT (-1 0.1)",
+		                   "POINT (1 0)",
+						   "POINT (2 0)", "POINT (3  0)", "POINT ( 3 -0.1)", "POINT ( 3 0.1)" };
+	uint32_t num_geoms = sizeof(wkt_inputs) / sizeof(char*);
+	LWGEOM** geoms = WKTARRAY2LWGEOM(wkt_inputs, num_geoms);
+	UNIONFIND* uf = UF_create(num_geoms);
+	uint32_t* ids;
+	char* in_a_cluster;
+	uint32_t i;
+
+	/* Although POINT (1 0) and POINT (2 0) are within eps distance of each other,
+	 * they do not connect the two clusters because POINT (1 0) is not a core point.
+	 * See #3572
+	 */
+	double eps = 1.01;
+	uint32_t min_points = 5;
+	uint32_t expected_ids[] = { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1 };
+
+	union_dbscan(geoms, num_geoms, uf, eps, min_points, &in_a_cluster);
+	ids = UF_get_collapsed_cluster_ids(uf, in_a_cluster);
+
+	ASSERT_INTARRAY_EQUAL(ids, expected_ids, num_geoms);
+
+	UF_destroy(uf);
+	for (i = 0; i < num_geoms; i++)
+	{
+		lwgeom_free(geoms[i]);
+	}
+	lwfree(geoms);
+	lwfree(in_a_cluster);
+	lwfree(ids);
+}
+
 void geos_cluster_suite_setup(void);
 void geos_cluster_suite_setup(void)
 {
@@ -265,4 +300,5 @@
 	PG_ADD_TEST(suite, single_input_test);
 	PG_ADD_TEST(suite, empty_inputs_test);
 	PG_ADD_TEST(suite, multipoint_test);
+	PG_ADD_TEST(suite, dbscan_test);
 }

Modified: trunk/liblwgeom/cunit/cu_tester.h
===================================================================
--- trunk/liblwgeom/cunit/cu_tester.h	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/cunit/cu_tester.h	2016-06-06 12:24:32 UTC (rev 14934)
@@ -54,5 +54,24 @@
 	CU_ASSERT_TRUE(lwgeom_same(o, e)); \
 } while(0);
 
+#define ASSERT_INTARRAY_EQUAL(o, e, n) do { \
+	size_t i = 0; \
+	for (i = 0; i < n; i++) { \
+		if (o[i] != e[i]) { \
+			fprintf(stderr, "[%s:%d]", __FILE__, __LINE__); \
+			fprintf(stderr, "\nExpected: ["); \
+			for (i = 0; i < n; i++) \
+				fprintf(stderr, " %d", e[i]); \
+			fprintf(stderr, " ]\nObtained: ["); \
+			for (i = 0; i < n; i++) \
+				fprintf(stderr, " %d", o[i]); \
+			fprintf(stderr, " ]\n"); \
+			CU_FAIL(); \
+			break; \
+		} \
+	} \
+	CU_PASS(); \
+} while(0);
+
 /* Utility functions */
 void do_fn_test(LWGEOM* (*transfn)(LWGEOM*), char *input_wkt, char *expected_wkt);

Modified: trunk/liblwgeom/cunit/cu_unionfind.c
===================================================================
--- trunk/liblwgeom/cunit/cu_unionfind.c	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/cunit/cu_unionfind.c	2016-06-06 12:24:32 UTC (rev 14934)
@@ -22,10 +22,10 @@
 	uint32_t expected_initial_ids[] =   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
 	uint32_t expected_initial_sizes[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
 
-	CU_ASSERT_EQUAL(10, uf->N);
-	CU_ASSERT_EQUAL(10, uf->num_clusters);
-	CU_ASSERT_EQUAL(0, memcmp(uf->clusters, expected_initial_ids, 10*sizeof(uint32_t)));
-	CU_ASSERT_EQUAL(0, memcmp(uf->cluster_sizes, expected_initial_sizes, 10*sizeof(uint32_t)));
+	ASSERT_INT_EQUAL(uf->N, 10);
+	ASSERT_INT_EQUAL(uf->num_clusters, 10);
+	ASSERT_INTARRAY_EQUAL(uf->clusters, expected_initial_ids, 10);
+	ASSERT_INTARRAY_EQUAL(uf->cluster_sizes, expected_initial_sizes, 10);
 
 	UF_destroy(uf);
 }
@@ -42,10 +42,10 @@
 	uint32_t expected_final_ids[] =   { 0, 2, 2, 2, 4, 5, 6, 0, 0, 9 };
 	uint32_t expected_final_sizes[] = { 3, 0, 3, 0, 1, 1, 1, 0, 0, 1 };
 
-	CU_ASSERT_EQUAL(10, uf->N);
-	CU_ASSERT_EQUAL(6, uf->num_clusters);
-	CU_ASSERT_EQUAL(0, memcmp(uf->clusters, expected_final_ids, 10*sizeof(uint32_t)));
-	CU_ASSERT_EQUAL(0, memcmp(uf->cluster_sizes, expected_final_sizes, 10*sizeof(uint32_t)));
+	ASSERT_INT_EQUAL(uf->N, 10);
+	ASSERT_INT_EQUAL(uf->num_clusters, 6);
+	ASSERT_INTARRAY_EQUAL(uf->clusters, expected_final_ids, 10);
+	ASSERT_INTARRAY_EQUAL(uf->cluster_sizes, expected_final_sizes, 10);
 
 	UF_destroy(uf);
 }
@@ -140,17 +140,23 @@
 	 * 7 -> 1
 	 * 8 -> 2
 	 */
-	uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 };
-	uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0);
+	uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
+	uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, NULL);
 
-	CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+	ASSERT_INTARRAY_EQUAL(collapsed_ids, expected_collapsed_ids, 10);
 
 	lwfree(collapsed_ids);
 
-	uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 };
-	collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999);
+	char is_in_cluster[] = { 0, 1, 1, 1, 0, 1, 0, 0, 0, 0 };
+	uint32_t expected_collapsed_ids2[] = { 8, 0, 0, 0, 7, 0, 8, 7, 8, 7 };
 
-	CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t)));
+	collapsed_ids = UF_get_collapsed_cluster_ids(uf, is_in_cluster);
+	int i;
+	for (i = 0; i < uf->N; i++)
+	{ 
+		if (is_in_cluster[i])
+			ASSERT_INT_EQUAL(expected_collapsed_ids2[i], collapsed_ids[i]);
+	}
 
 	lwfree(collapsed_ids);
 	UF_destroy(uf);

Modified: trunk/liblwgeom/lwgeom_geos.h
===================================================================
--- trunk/liblwgeom/lwgeom_geos.h	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwgeom_geos.h	2016-06-06 12:24:32 UTC (rev 14934)
@@ -43,8 +43,7 @@
 
 int cluster_intersecting(GEOSGeometry** geoms, uint32_t num_geoms, GEOSGeometry*** clusterGeoms, uint32_t* num_clusters);
 int cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
-int cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters);
-int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points);
+int union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** is_in_cluster_ret);
 
 POINTARRAY *ptarray_from_GEOSCoordSeq(const GEOSCoordSequence *cs, char want3d);
 

Modified: trunk/liblwgeom/lwgeom_geos_cluster.c
===================================================================
--- trunk/liblwgeom/lwgeom_geos_cluster.c	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwgeom_geos_cluster.c	2016-06-06 12:24:32 UTC (rev 14934)
@@ -235,17 +235,7 @@
 	return cluster_success;
 }
 
-/** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
- *  GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
- *  responsible for freeing the input array, but not the LWGEOM* items inside it. */
-int
-cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
-{
-	int min_points = 1;
-	return cluster_dbscan(geoms, num_geoms, tolerance, min_points, clusterGeoms, num_clusters);
-}
 
-
 static int
 dbscan_update_context(GEOSSTRtree* tree, struct QueryContext* cxt, LWGEOM** geoms, uint32_t p, double eps)
 {
@@ -268,7 +258,7 @@
 }
 
 int
-union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points)
+union_dbscan(LWGEOM** geoms, uint32_t num_geoms, UNIONFIND* uf, double eps, uint32_t min_points, char** in_a_cluster_ret)
 {
 	uint32_t p, i;
 	struct STRTree tree;
@@ -279,6 +269,8 @@
 		.items_found_size = 0
 	};
 	int success = LW_SUCCESS;
+	char* in_a_cluster;
+	char* is_in_core;
 
 	if (num_geoms <= 1)
 		return LW_SUCCESS;
@@ -290,6 +282,11 @@
 		return LW_FAILURE;
 	}
 
+	in_a_cluster = lwalloc(num_geoms * sizeof(char));
+	memset(in_a_cluster, 0, num_geoms * sizeof(char));
+	is_in_core = lwalloc(num_geoms * sizeof(char));
+	memset(is_in_core, 0, num_geoms * sizeof(char));
+
 	for (p = 0; p < num_geoms; p++)
 	{
 		uint32_t num_neighbors = 0;
@@ -313,23 +310,53 @@
 			}
 
 			if (mindist <= eps)
+			{
 				neighbors[num_neighbors++] = q;
+			}
 		}
 
+		if (!success)
+			break;
+
 		if (num_neighbors >= min_points)
 		{
+			in_a_cluster[p] = LW_TRUE;
+			is_in_core[p] = LW_TRUE;
+
 			for (i = 0; i < num_neighbors; i++)
 			{
-				UF_union(uf, p, neighbors[i]);
+				uint32_t q = neighbors[i];
+
+				if (in_a_cluster[q])
+				{
+					/* Can we merge p's cluster with q's cluster?  We can do this only
+					 * if both p and q are considered _core_ points of their respective
+					 * clusters.
+					 */
+					 if (is_in_core[q])
+					 {
+						 UF_union(uf, p, q);
+					 }
+				}
+				else
+				{
+					UF_union(uf, p, q);
+					in_a_cluster[q] = LW_TRUE;
+				}
 			}
 		}
 
 		lwfree(neighbors);
-
-		if (!success)
-			break;
 	}
 
+	lwfree(is_in_core);
+
+	/* Either pass in_a_cluster to our caller, or free it. */
+	if (in_a_cluster_ret)
+		*in_a_cluster_ret = in_a_cluster;
+	else
+		lwfree(in_a_cluster);
+
 	if (cxt.items_found)
 		lwfree(cxt.items_found);
 
@@ -337,13 +364,16 @@
 	return success;
 }
 
+/** Takes an array of LWGEOM* and constructs an array of LWGEOM*, where each element in the constructed array is a
+ *  GeometryCollection representing a set of geometries separated by no more than the specified tolerance. Caller is
+ *  responsible for freeing the input array, but not the LWGEOM* items inside it. */
 int
-cluster_dbscan(LWGEOM** geoms, uint32_t num_geoms, double eps, uint32_t min_points, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
+cluster_within_distance(LWGEOM** geoms, uint32_t num_geoms, double tolerance, LWGEOM*** clusterGeoms, uint32_t* num_clusters)
 {
 	int cluster_success;
 	UNIONFIND* uf = UF_create(num_geoms);
 
-	if (union_dbscan(geoms, num_geoms, uf, eps, min_points) == LW_FAILURE)
+	if (union_dbscan(geoms, num_geoms, uf, tolerance, 1, NULL) == LW_FAILURE)
 	{
 		UF_destroy(uf);
 		return LW_FAILURE;

Modified: trunk/liblwgeom/lwunionfind.c
===================================================================
--- trunk/liblwgeom/lwunionfind.c	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwunionfind.c	2016-06-06 12:24:32 UTC (rev 14934)
@@ -142,25 +142,25 @@
 }
 
 uint32_t*
-UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id)
+UF_get_collapsed_cluster_ids(UNIONFIND* uf, const char* is_in_cluster)
 {
 	uint32_t* ordered_components = UF_ordered_by_cluster(uf);
 	uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
 	uint32_t last_old_id, current_new_id, i;
+	char encountered_cluster = LW_FALSE;
 
-	last_old_id = UF_find(uf, ordered_components[0]);
-	current_new_id = 1;
+	current_new_id = 0;
 	for (i = 0; i < uf->N; i++)
 	{
 		uint32_t j = ordered_components[i];
-
-		if (UF_size(uf, j) < min_cluster_size)
+		if (!is_in_cluster || is_in_cluster[j])
 		{
-			new_ids[j] = noise_cluster_id;
-		}
-		else
-		{
 			uint32_t current_old_id = UF_find(uf, j);
+			if (!encountered_cluster)
+			{
+				encountered_cluster = LW_TRUE;
+				last_old_id = current_old_id;
+			}
 
 			if (current_old_id != last_old_id)
 				current_new_id++;

Modified: trunk/liblwgeom/lwunionfind.h
===================================================================
--- trunk/liblwgeom/lwunionfind.h	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/liblwgeom/lwunionfind.h	2016-06-06 12:24:32 UTC (rev 14934)
@@ -56,9 +56,9 @@
 uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
 
 /* Replace the cluster ids in a UNIONFIND with sequential ids starting at one. 
- * If min_cluster_size is greater than 1, any clusters with fewer than
- * min_cluster_size items will be assigned to noise_cluster_id.
+ * If is_in_cluster array is provided, it will be used to skip any indexes
+ * that are not in a cluster.
  * */
-uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id);
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, const char* is_in_cluster);
 
 #endif

Modified: trunk/postgis/lwgeom_geos.c
===================================================================
--- trunk/postgis/lwgeom_geos.c	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/postgis/lwgeom_geos.c	2016-06-06 12:24:32 UTC (rev 14934)
@@ -3191,7 +3191,6 @@
 	LWGEOM** lw_inputs;
 	LWGEOM** lw_results;
 	double tolerance;
-	int min_points;
 	int srid=SRID_UNKNOWN;
 
 	/* Parameters used to construct a result array */
@@ -3212,17 +3211,6 @@
 		PG_RETURN_NULL();
 	}
 
-	if (PG_NARGS() < 3)
-		min_points = 1;
-	else
-		min_points = PG_GETARG_INT32(2);
-
-	if (min_points < 1)
-	{
-		lwpgerror("min_points must be a positive number.");
-		PG_RETURN_NULL();
-	}
-
 	nelems = array_nelems_not_null(array);
 
 	POSTGIS_DEBUGF(3, "cluster_within_distance_garray: number of non-null elements: %d", nelems);
@@ -3240,7 +3228,7 @@
 		PG_RETURN_NULL();
 	}
 
-	if (cluster_dbscan(lw_inputs, nelems, tolerance, min_points, &lw_results, &nclusters) != LW_SUCCESS)
+	if (cluster_within_distance(lw_inputs, nelems, tolerance, &lw_results, &nclusters) != LW_SUCCESS)
 	{
 		elog(ERROR, "cluster_within: Error performing clustering");
 		PG_RETURN_NULL();

Modified: trunk/postgis/lwgeom_window.c
===================================================================
--- trunk/postgis/lwgeom_window.c	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/postgis/lwgeom_window.c	2016-06-06 12:24:32 UTC (rev 14934)
@@ -92,6 +92,7 @@
 		uint32_t i;
 		uint32_t* result_ids;
 		LWGEOM** geoms;
+		char* is_in_cluster;
 		UNIONFIND* uf;
 		bool tolerance_is_null;
 		bool minpoints_is_null;
@@ -127,7 +128,7 @@
 			}
 		}
 
-		if (union_dbscan(geoms, ngeoms, uf, tolerance, minpoints) == LW_SUCCESS)
+		if (union_dbscan(geoms, ngeoms, uf, tolerance, minpoints, &is_in_cluster) == LW_SUCCESS)
 			context->is_error = LW_FALSE;
 
 		for (i = 0; i < ngeoms; i++)
@@ -139,26 +140,21 @@
 		if (context->is_error)
 		{
 			UF_destroy(uf);
+			lwfree(is_in_cluster);
 			lwpgerror("Error during clustering");
 			PG_RETURN_NULL();
 		}
 
-		result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0);
+		result_ids = UF_get_collapsed_cluster_ids(uf, is_in_cluster);
 		for (i = 0; i < ngeoms; i++)
 		{
-			if (result_ids[i] == 0)
+			if (!is_in_cluster[i])
 			{
 				context->cluster_assignments[i].is_null = LW_TRUE;
 			}
 			else
 			{
-				/* We overloaded the zero cluster ID above to tag "noise" geometries
-				 * that are not part of any cluster.  Now that those have been
-				 * properly converted into NULLs, we need to subtract one from 
-				 * the collapsed cluster IDs so that we get a zero-based sequence, 
-				 * consistent with our good friend ST_ClusterKMeans.
-				 */
-				context->cluster_assignments[i].cluster_id = result_ids[i] - 1;
+				context->cluster_assignments[i].cluster_id = result_ids[i];
 			}
 		}
 

Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected	2016-06-04 18:18:37 UTC (rev 14933)
+++ trunk/regress/cluster_expected	2016-06-06 12:24:32 UTC (rev 14934)
@@ -24,6 +24,6 @@
 t103|1|
 t103|2|
 t103|3|
-t103|4|1
-t103|5|1
-t103|6|1
+t103|4|0
+t103|5|0
+t103|6|0



More information about the postgis-tickets mailing list