[postgis-tickets] r14928 - #3567, Make ST_ClusterDBSCAN return a NULL id for inputs not in any cluster
Daniel Baston
dbaston at gmail.com
Wed Jun 1 18:24:52 PDT 2016
Author: dbaston
Date: 2016-06-01 18:24:52 -0700 (Wed, 01 Jun 2016)
New Revision: 14928
Modified:
trunk/doc/reference_measure.xml
trunk/liblwgeom/cunit/cu_unionfind.c
trunk/liblwgeom/lwunionfind.c
trunk/liblwgeom/lwunionfind.h
trunk/postgis/lwgeom_window.c
trunk/regress/cluster_expected
Log:
#3567, Make ST_ClusterDBSCAN return a NULL id for inputs not in any cluster
Modified: trunk/doc/reference_measure.xml
===================================================================
--- trunk/doc/reference_measure.xml 2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/doc/reference_measure.xml 2016-06-02 01:24:52 UTC (rev 14928)
@@ -1082,8 +1082,9 @@
<ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink>
algorithm. An input geometry will be added to a cluster if it is
within <varname>eps</varname> distance of at least
- <varname>minpoints</varname> other input geometries. Unlike <xref linkend="ST_ClusterKMeans" />, it does not require to specify the number of
- clusters for each dataset, but instead uses the desired input distance <varname>eps</varname> at optimal cluster size for each cluster.
+ <varname>minpoints</varname> other input geometries. Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of
+ clusters to be specified, but instead uses the desired distance (<varname>eps</varname>) and density(<varname>minpoints</varname>) parameters
+ to construct each cluster. Input geometries that do not meet the criteria to join any other cluster will be assigned a cluster number of NULL.
</para>
<para>Availability: 2.3.0 - requires GEOS </para>
</refsection>
Modified: trunk/liblwgeom/cunit/cu_unionfind.c
===================================================================
--- trunk/liblwgeom/cunit/cu_unionfind.c 2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/liblwgeom/cunit/cu_unionfind.c 2016-06-02 01:24:52 UTC (rev 14928)
@@ -125,14 +125,35 @@
uf->clusters[8] = 8;
uf->clusters[9] = 7;
+ uf->cluster_sizes[0] = 3;
+ uf->cluster_sizes[1] = 4;
+ uf->cluster_sizes[2] = 4;
+ uf->cluster_sizes[3] = 4;
+ uf->cluster_sizes[4] = 3;
+ uf->cluster_sizes[5] = 4;
+ uf->cluster_sizes[6] = 3;
+ uf->cluster_sizes[7] = 3;
+ uf->cluster_sizes[8] = 3;
+ uf->cluster_sizes[9] = 3;
+
/* 5 -> 0
* 7 -> 1
* 8 -> 2
*/
- uint32_t expected_collapsed_ids[] = { 2, 0, 0, 0, 1, 0, 2, 1, 2, 1 };
- uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf);
+ uint32_t expected_collapsed_ids[] = { 3, 1, 1, 1, 2, 1, 3, 2, 3, 2 };
+ uint32_t* collapsed_ids = UF_get_collapsed_cluster_ids(uf, 1, 0);
CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids, 10*sizeof(uint32_t)));
+
+ lwfree(collapsed_ids);
+
+ uint32_t expected_collapsed_ids2[] = { 999, 1, 1, 1, 999, 1, 999, 999, 999, 999 };
+ collapsed_ids = UF_get_collapsed_cluster_ids(uf, 4, 999);
+
+ CU_ASSERT_EQUAL(0, memcmp(collapsed_ids, expected_collapsed_ids2, 10*sizeof(uint32_t)));
+
+ lwfree(collapsed_ids);
+ UF_destroy(uf);
}
void unionfind_suite_setup(void);
Modified: trunk/liblwgeom/lwunionfind.c
===================================================================
--- trunk/liblwgeom/lwunionfind.c 2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/liblwgeom/lwunionfind.c 2016-06-02 01:24:52 UTC (rev 14928)
@@ -74,6 +74,12 @@
return i;
}
+uint32_t
+UF_size (UNIONFIND* uf, uint32_t i)
+{
+ return uf->cluster_sizes[UF_find(uf, i)];
+}
+
void
UF_union(UNIONFIND* uf, uint32_t i, uint32_t j)
{
@@ -136,24 +142,32 @@
}
uint32_t*
-UF_get_collapsed_cluster_ids(UNIONFIND* uf)
+UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id)
{
uint32_t* ordered_components = UF_ordered_by_cluster(uf);
uint32_t* new_ids = lwalloc(uf->N * sizeof(uint32_t));
uint32_t last_old_id, current_new_id, i;
last_old_id = UF_find(uf, ordered_components[0]);
- current_new_id = 0;
+ current_new_id = 1;
for (i = 0; i < uf->N; i++)
{
uint32_t j = ordered_components[i];
- uint32_t current_old_id = UF_find(uf, j);
- if (current_old_id != last_old_id)
- current_new_id++;
+ if (UF_size(uf, j) < min_cluster_size)
+ {
+ new_ids[j] = noise_cluster_id;
+ }
+ else
+ {
+ uint32_t current_old_id = UF_find(uf, j);
- new_ids[j] = current_new_id;
- last_old_id = current_old_id;
+ if (current_old_id != last_old_id)
+ current_new_id++;
+
+ new_ids[j] = current_new_id;
+ last_old_id = current_old_id;
+ }
}
lwfree(ordered_components);
Modified: trunk/liblwgeom/lwunionfind.h
===================================================================
--- trunk/liblwgeom/lwunionfind.h 2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/liblwgeom/lwunionfind.h 2016-06-02 01:24:52 UTC (rev 14928)
@@ -45,14 +45,20 @@
/* Identify the cluster id associated with specified component id */
uint32_t UF_find(UNIONFIND* uf, uint32_t i);
-/* Merge the clusters that contain the two specified components ids */
+/* Get the size of the cluster associated with the specified component id */
+uint32_t UF_size(UNIONFIND* uf, uint32_t i);
+
+/* Merge the clusters that contain the two specified component ids */
void UF_union(UNIONFIND* uf, uint32_t i, uint32_t j);
/* Return an array of component ids, where components that are in the
* same cluster are contiguous in the array */
uint32_t* UF_ordered_by_cluster(UNIONFIND* uf);
-/* Replace the cluster ids in a UNIONFIND with sequential ids starting at zero. */
-uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf);
+/* Replace the cluster ids in a UNIONFIND with sequential ids starting at one.
+ * If min_cluster_size is greater than 1, any clusters with fewer than
+ * min_cluster_size items will be assigned to noise_cluster_id.
+ * */
+uint32_t* UF_get_collapsed_cluster_ids(UNIONFIND* uf, uint32_t min_cluster_size, uint32_t noise_cluster_id);
#endif
Modified: trunk/postgis/lwgeom_window.c
===================================================================
--- trunk/postgis/lwgeom_window.c 2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/postgis/lwgeom_window.c 2016-06-02 01:24:52 UTC (rev 14928)
@@ -143,10 +143,23 @@
PG_RETURN_NULL();
}
- result_ids = UF_get_collapsed_cluster_ids(uf);
+ result_ids = UF_get_collapsed_cluster_ids(uf, minpoints, 0);
for (i = 0; i < ngeoms; i++)
{
- context->cluster_assignments[i].cluster_id = result_ids[i];
+ if (result_ids[i] == 0)
+ {
+ context->cluster_assignments[i].is_null = LW_TRUE;
+ }
+ else
+ {
+ /* We overloaded the zero cluster ID above to tag "noise" geometries
+ * that are not part of any cluster. Now that those have been
+ * properly converted into NULLs, we need to subtract one from
+ * the collapsed cluster IDs so that we get a zero-based sequence,
+ * consistent with our good friend ST_ClusterKMeans.
+ */
+ context->cluster_assignments[i].cluster_id = result_ids[i] - 1;
+ }
}
lwfree(result_ids);
Modified: trunk/regress/cluster_expected
===================================================================
--- trunk/regress/cluster_expected 2016-06-01 14:32:41 UTC (rev 14927)
+++ trunk/regress/cluster_expected 2016-06-02 01:24:52 UTC (rev 14928)
@@ -15,15 +15,15 @@
t101|4|1
t101|5|1
t101|6|1
-t102|1|0
-t102|2|1
-t102|3|2
-t102|4|3
-t102|5|4
-t102|6|5
-t103|1|0
-t103|2|1
-t103|3|2
-t103|4|3
-t103|5|3
-t103|6|3
+t102|1|
+t102|2|
+t102|3|
+t102|4|
+t102|5|
+t102|6|
+t103|1|
+t103|2|
+t103|3|
+t103|4|1
+t103|5|1
+t103|6|1
More information about the postgis-tickets
mailing list