[postgis-tickets] r14928 - #3567, Make ST_ClusterDBSCAN return a NULL id for inputs not in any cluster

Daniel Baston dbaston at gmail.com
Wed Jun 1 18:24:52 PDT 2016


Author: dbaston
Date: 2016-06-01 18:24:52 -0700 (Wed, 01 Jun 2016)
New Revision: 14928

Modified:
   trunk/doc/reference_measure.xml
   trunk/liblwgeom/cunit/cu_unionfind.c
   trunk/liblwgeom/lwunionfind.c
   trunk/liblwgeom/lwunionfind.h
   trunk/postgis/lwgeom_window.c
   trunk/regress/cluster_expected
Log:
#3567, Make ST_ClusterDBSCAN return a NULL id for inputs not in any cluster

Modified: trunk/doc/reference_measure.xml
===================================================================
--- trunk/doc/reference_measure.xml	2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/doc/reference_measure.xml	2016-06-02 01:24:52 UTC (rev 14928)
@@ -1082,8 +1082,9 @@
           <ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink> 
           algorithm.  An input geometry will be added to a cluster if it is 
           within <varname>eps</varname> distance of at least
-          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require to specify the number of
-          clusters for each dataset, but instead uses the desired input distance <varname>eps</varname> at optimal cluster size for each cluster.
+          <varname>minpoints</varname> other input geometries.  Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of
+          clusters to be specified, but instead uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters 
+          to construct each cluster.  Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
       </para>
       <para>Availability: 2.3.0 - requires GEOS </para>
     </refsection>

Modified: trunk/liblwgeom/cunit/cu_unionfind.c
===================================================================
--- trunk/liblwgeom/cunit/cu_unionfind.c	2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/liblwgeom/cunit/cu_unionfind.c	2016-06-02 01:24:52 UTC (rev 14928)
@@ -125,14 +125,35 @@
 	uf->clusters[8] = 8;
 	uf->clusters[9] = 7;
 
+	uf->cluster_sizes[0] = 3;
+	uf->cluster_sizes[1] = 4;
+	uf->cluster_sizes[2] = 4;
+	uf->cluster_sizes[3] = 4;
+	uf->cluster_sizes[4] = 3;
+	uf->cluster_sizes[5] = 4;
+	uf->cluster_sizes[6] = 3;
+	uf->cluster_sizes[7] = 3;
+	uf->cluster_sizes[8] = 3;
+	uf->cluster_sizes[9] = 3;
+
 	/* 5 -> 0
 	 * 7 -> 1
 	 * 8 -> 2
 	 */
-	uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
-	uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf);
+	uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 };
+	uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0);
 
 	CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+
+	lwfree(collapsed_ids);
+
+	uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 };
+	collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999);
+
+	CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t)));
+
+	lwfree(collapsed_ids);
+	UF_destroy(uf);
 }
 
 void unionfind_suite_setup(void);

Modified: trunk/liblwgeom/lwunionfind.c
===================================================================
--- trunk/liblwgeom/lwunionfind.c	2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/liblwgeom/lwunionfind.c	2016-06-02 01:24:52 UTC (rev 14928)
@@ -74,6 +74,12 @@
 	return i;
 }
 
+uint32_t
+UF_size (UNIONFIND* uf, uint32_t i)
+{
+    return uf->cluster_sizes[UF_find(uf, i)];
+}
+
 void
 UF_union(UNIONFIND* uf, uint32_t i, uint32_t j)
 {
@@ -136,24 +142,32 @@
 }
 
 uint32_t*
-UF_get_collapsed_cluster_ids(UNIONFIND* uf)
+UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id)
 {
 	uint32_t* ordered_components = UF_ordered_by_cluster(uf);
 	uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
 	uint32_t last_old_id, current_new_id, i;
 
 	last_old_id = UF_find(uf, ordered_components[0]);
-	current_new_id = 0;
+	current_new_id = 1;
 	for (i = 0; i < uf->N; i++)
 	{
 		uint32_t j = ordered_components[i];
-		uint32_t current_old_id = UF_find(uf, j);
 
-		if (current_old_id != last_old_id)
-			current_new_id++;
+		if (UF_size(uf, j) < min_cluster_size)
+		{
+			new_ids[j] = noise_cluster_id;
+		}
+		else
+		{
+			uint32_t current_old_id = UF_find(uf, j);
 
-		new_ids[j] = current_new_id;
-		last_old_id = current_old_id;
+			if (current_old_id != last_old_id)
+				current_new_id++;
+
+			new_ids[j] = current_new_id;
+			last_old_id = current_old_id;
+		}
 	}
 
 	lwfree(ordered_components);

Modified: trunk/liblwgeom/lwunionfind.h
===================================================================
--- trunk/liblwgeom/lwunionfind.h	2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/liblwgeom/lwunionfind.h	2016-06-02 01:24:52 UTC (rev 14928)
@@ -45,14 +45,20 @@
 /* Identify the cluster id associated with specified component id */
 uint32_t UF_find(UNIONFIND* uf, uint32_t i);
 
-/* Merge the clusters that contain the two specified components ids */
+/* Get the size of the cluster associated with the specified component id */
+uint32_t UF_size(UNIONFIND* uf, uint32_t i);
+
+/* Merge the clusters that contain the two specified component ids */
 void UF_union(UNIONFIND* uf, uint32_t i, uint32_t j);
 
 /* Return an array of component ids, where components that are in the
  * same cluster are contiguous in the array */
 uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
 
-/* Replace the cluster ids in a UNIONFIND with sequential ids starting at zero. */
-uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf);
+/* Replace the cluster ids in a UNIONFIND with sequential ids starting at one. 
+ * If min_cluster_size is greater than 1, any clusters with fewer than
+ * min_cluster_size items will be assigned to noise_cluster_id.
+ * */
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id);
 
 #endif

Modified: trunk/postgis/lwgeom_window.c
===================================================================
--- trunk/postgis/lwgeom_window.c	2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/postgis/lwgeom_window.c	2016-06-02 01:24:52 UTC (rev 14928)
@@ -143,10 +143,23 @@
 			PG_RETURN_NULL();
 		}
 
-		result_ids = UF_get_collapsed_cluster_ids(uf);
+		result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0);
 		for (i = 0; i < ngeoms; i++)
 		{
-			context->cluster_assignments[i].cluster_id = result_ids[i];
+			if (result_ids[i] == 0)
+			{
+				context->cluster_assignments[i].is_null = LW_TRUE;
+			}
+			else
+			{
+				/* We overloaded the zero cluster ID above to tag "noise" geometries
+				 * that are not part of any cluster.  Now that those have been
+				 * properly converted into NULLs, we need to subtract one from 
+				 * the collapsed cluster IDs so that we get a zero-based sequence, 
+				 * consistent with our good friend ST_ClusterKMeans.
+				 */
+				context->cluster_assignments[i].cluster_id = result_ids[i] - 1;
+			}
 		}
 
 		lwfree(result_ids);

Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected	2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/regress/cluster_expected	2016-06-02 01:24:52 UTC (rev 14928)
@@ -15,15 +15,15 @@
 t101|4|1
 t101|5|1
 t101|6|1
-t102|1|0
-t102|2|1
-t102|3|2
-t102|4|3
-t102|5|4
-t102|6|5
-t103|1|0
-t103|2|1
-t103|3|2
-t103|4|3
-t103|5|3
-t103|6|3
+t102|1|
+t102|2|
+t102|3|
+t102|4|
+t102|5|
+t102|6|
+t103|1|
+t103|2|
+t103|3|
+t103|4|1
+t103|5|1
+t103|6|1



More information about the postgis-tickets mailing list