[GRASS-SVN] r64017 - grass/trunk/vector/v.cluster
svn_grass at osgeo.org
svn_grass at osgeo.org
Thu Jan 8 14:28:02 PST 2015
Author: mmetz
Date: 2015-01-08 14:28:02 -0800 (Thu, 08 Jan 2015)
New Revision: 64017
Modified:
grass/trunk/vector/v.cluster/main.c
grass/trunk/vector/v.cluster/v.cluster.html
Log:
v.cluster: add methods dbscan2,density,optics,optics2
Modified: grass/trunk/vector/v.cluster/main.c
===================================================================
--- grass/trunk/vector/v.cluster/main.c 2015-01-08 22:05:12 UTC (rev 64016)
+++ grass/trunk/vector/v.cluster/main.c 2015-01-08 22:28:02 UTC (rev 64017)
@@ -18,30 +18,65 @@
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <grass/gis.h>
#include <grass/vector.h>
#include <grass/glocale.h>
#include <grass/kdtree.h>
+#ifdef MAX
+#undef MAX
+#endif
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define CL_DBSCAN 1
+#define CL_DBSCAN2 2
+#define CL_DENSE 3
+#define CL_OPTICS 4
+#define CL_OPTICS2 5
+
+#define GET_PARENT(p, c) ((p) = (int) (((c) - 2) / 3 + 1))
+#define GET_CHILD(c, p) ((c) = (int) (((p) * 3) - 1))
+
+
+struct cl_pnt
+{
+ int uid;
+ int prevpnt;
+ double cd;
+ double reach;
+ double c[3];
+};
+
+static struct cl_pnt *clp;
+
+static int *heapidx;
+static int heapsize;
+
+int add_pt(int idx);
+int drop_pt(void);
+
int main(int argc, char *argv[])
{
struct Map_info In, Out;
- static struct line_pnts *Points;
+ struct line_pnts *Points;
struct line_cats *Cats;
int i, j, type, cat, is3d;
struct GModule *module;
- struct Option *input, *output, *dist_opt, *min_opt, *lyr_opt;
+ struct Option *input, *output, *lyr_opt;
+ struct Option *dist_opt, *min_opt, *method_opt;
struct Flag *flag_2d, *flag_topo, *flag_attr;
int clayer;
int npoints, nlines;
int *cid, *idx, *renumber, OLD, NEW;
- int nclusters;
+ int nclusters, noutliers;
struct kdtree *kdt;
struct kdtrav trav;
double c[3];
int uid;
double eps;
int ndims, minpnts;
+ int clmethod;
double *kddist;
int kdfound, *kduid;
@@ -78,6 +113,14 @@
min_opt->required = NO;
min_opt->label = _("Minimum number of points to create a cluster");
+ method_opt = G_define_option();
+ method_opt->type = TYPE_STRING;
+ method_opt->key = "method";
+ method_opt->options = "dbscan,dbscan2,density,optics,optics2";
+ method_opt->answer = "dbscan";
+ method_opt->required = NO;
+ method_opt->label = _("Clustering method");
+
flag_2d = G_define_flag();
flag_2d->key = '2';
flag_2d->label = _("Force 2D clustering");
@@ -125,18 +168,34 @@
}
clayer = atoi(lyr_opt->answer);
+ if (clayer < 1)
+ G_fatal_error(_("Option %s must be positive"), lyr_opt->key);
+ clmethod = CL_DBSCAN;
+ if (!strcmp(method_opt->answer, "dbscan2"))
+ clmethod = CL_DBSCAN2;
+ else if (!strcmp(method_opt->answer, "density"))
+ clmethod = CL_DENSE;
+ else if (!strcmp(method_opt->answer, "optics"))
+ clmethod = CL_OPTICS;
+ else if (!strcmp(method_opt->answer, "optic2"))
+ clmethod = CL_OPTICS2;
+
/* count points */
G_message(_("Counting input points ..."));
npoints = nlines = 0;
while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
nlines++;
- if (type == GV_POINT)
+ if (type == GV_POINT) {
+ if (Vect_cat_get(Cats, clayer, &cat))
+ G_fatal_error(_("Layer %d is not empty, choose another layer"),
+ clayer);
npoints++;
+ }
}
- if (npoints == 0) {
- G_warning(_("No points in input, nothing to do"));
+ if (npoints < minpnts + 1) {
+ G_warning(_("Not enough points in input, nothing to do"));
Vect_close(&In);
exit(EXIT_SUCCESS);
}
@@ -160,6 +219,7 @@
Vect_rewind(&In);
i = 0;
cid[0] = 0;
+ idx[0] = 0;
while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
G_percent(i++, nlines, 4);
cid[i] = 0;
@@ -176,178 +236,883 @@
kdtree_optimize(kdt, 2);
- /* get epsilon */
- if (dist_opt->answer) {
- eps = atof(dist_opt->answer);
- if (eps <= 0)
- G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
- }
- else {
- int n;
- double dist, mean, min, max, sum, sumsq, sd;
- double *kd;
- int *ki;
+ noutliers = nclusters = 0;
+ if (clmethod == CL_DBSCAN) {
+ /* DBSCAN
+ * the neighbors of each point with at least minpnts neighbors
+ * are added to a cluster */
- /* estimate epsilon */
- G_message(_("Estimating maximum distance ..."));
+ /* get epsilon */
+ if (dist_opt->answer) {
+ eps = atof(dist_opt->answer);
+ if (eps <= 0)
+ G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
+ }
+ else {
+ int n;
+ double dist, mean, min, max, sum, sumsq, sd;
+ double *kd;
+ int *ki;
+
+ /* estimate epsilon */
+ G_message(_("Estimating maximum distance ..."));
+ kdtree_init_trav(&trav, kdt);
+ c[2] = 0.0;
+ n = 0;
+ sum = sumsq = 0;
+ min = 1.0 / 0.0;
+ max = 0;
+ kd = G_malloc(minpnts * sizeof(double));
+ ki = G_malloc(minpnts * sizeof(int));
+ i = 0;
+ while (kdtree_traverse(&trav, c, &uid)) {
+ G_percent(i++, npoints, 4);
+
+ kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+ if (kdfound) {
+ dist = sqrt(kd[kdfound - 1]);
+ sum += dist;
+ sumsq += dist * dist;
+ n++;
+ if (min > dist)
+ min = dist;
+ if (max < dist)
+ max = dist;
+ }
+ }
+ G_percent(npoints, npoints, 4);
+
+ G_free(kd);
+ G_free(ki);
+
+ if (!n)
+ G_fatal_error(_("No neighbors found"));
+
+ mean = sum / n;
+ sd = sqrt(sumsq / n - mean * mean);
+ eps = mean + 1.644854 * sd; /* 90% CI */
+ eps = mean + 2.575829 * sd; /* 99% CI */
+
+ if (eps > max)
+ eps = max;
+
+ G_message(_("Distance to the %d nearest neighbor:"), minpnts);
+ G_message(_("Min: %g, max: %g"), min, max);
+ G_message(_("Mean: %g"), mean);
+ G_message(_("Standard deviation: %g"), sd);
+
+ G_message(_("Estimated maximum distance: %g"), eps);
+ }
+
+ /* create clusters */
+ G_message(_("Building clusters ..."));
+ nclusters = 0;
kdtree_init_trav(&trav, kdt);
c[2] = 0.0;
- n = 0;
- sum = sumsq = 0;
- min = 1.0 / 0.0;
- max = 0;
- kd = G_malloc(minpnts * sizeof(double));
- ki = G_malloc(minpnts * sizeof(int));
+ idx[0] = 0;
i = 0;
while (kdtree_traverse(&trav, c, &uid)) {
G_percent(i++, npoints, 4);
- kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+ /* radius search */
+ kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
+
+ /* must have min neighbors within radius */
+ if (kdfound >= minpnts) {
+
+ cat = idx[cid[uid]];
+
+ /* find latest cluster */
+ for (j = 0; j < kdfound; j++) {
+ if (cat < idx[cid[kduid[j]]]) {
+ cat = idx[cid[kduid[j]]];
+ }
+ }
+
+ if (cat == 0) {
+ /* start new cluster */
+ nclusters++;
+ cat = nclusters;
+ if (nclusters > nlines)
+ G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
+ idx[nclusters] = nclusters;
+ cid[uid] = nclusters;
+ }
+
+ /* set or update cluster ids */
+ if (cid[uid] != 0) {
+ /* relabel */
+ idx[cid[uid]] = cat;
+ }
+ else {
+ cid[uid] = cat;
+ }
+
+ for (j = 0; j < kdfound; j++) {
+ if (cid[kduid[j]] != 0) {
+ /* relabel */
+ idx[cid[kduid[j]]] = cat;
+ }
+ else {
+ cid[kduid[j]] = cat;
+ }
+ }
+ }
if (kdfound) {
- dist = sqrt(kd[kdfound - 1]);
- sum += dist;
- sumsq += dist * dist;
- n++;
- if (min > dist)
- min = dist;
- if (max < dist)
- max = dist;
+ G_free(kddist);
+ G_free(kduid);
}
}
G_percent(npoints, npoints, 4);
- G_free(kd);
- G_free(ki);
+ if (nclusters == 0) {
+ G_message(_("No clusters found, adjust option %s"), dist_opt->key);
+ Vect_close(&In);
+ Vect_close(&Out);
+ Vect_delete(output->answer);
+ exit(EXIT_SUCCESS);
+ }
- if (!n)
- G_fatal_error(_("No neighbors found"));
-
- mean = sum / n;
- sd = sqrt(sumsq / n - mean * mean);
- eps = mean + 1.644854 * sd; /* 90% CI */
- eps = mean + 2.575829 * sd; /* 99% CI */
-
- if (eps > max)
- eps = max;
+ /* generate a renumbering scheme */
+ G_message(_("Generating renumbering scheme..."));
+ G_debug(1, "%d initial clusters", nclusters);
+ /* allocate final clump ID */
+ renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
+ renumber[0] = 0;
+ cat = 1;
+ G_percent(0, nclusters, 1);
+ for (i = 1; i <= nclusters; i++) {
+ G_percent(i, nclusters, 4);
+ OLD = i;
+ NEW = idx[i];
+ if (OLD != NEW) {
+ renumber[i] = 0;
+ /* find valid clump ID */
+ while (OLD != NEW) {
+ OLD = NEW;
+ NEW = idx[OLD];
+ }
+ idx[i] = NEW;
+ }
+ else
+ /* set final clump id */
+ renumber[i] = cat++;
+ }
- G_message(_("Distance to the %d nearest neighbor:"), minpnts);
- G_message(_("Min: %g, max: %g"), min, max);
- G_message(_("Mean: %g"), mean);
- G_message(_("Standard deviation: %g"), sd);
+ nclusters = cat - 1;
- G_message(_("Estimated maximum distance: %g"), eps);
+ /* write cluster ids */
+ G_message(_("Write out cluster ids ..."));
+ Vect_rewind(&In);
+ i = 0;
+ noutliers = 0;
+ while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+ G_percent(i++, nlines, 4);
+ if (type == GV_POINT) {
+ cat = renumber[idx[cid[i]]];
+ if (!cat)
+ noutliers++;
+ Vect_cat_set(Cats, clayer, cat);
+ Vect_write_line(&Out, GV_POINT, Points, Cats);
+ }
+ }
+ G_percent(nlines, nlines, 4);
}
+ else if (clmethod == CL_DBSCAN2) {
+ /* DBSCAN, but cluster size must be at least minpnts + 1 */
+ int *clcnt;
- /* create clusters */
- G_message(_("Building clusters ..."));
- nclusters = 0;
- kdtree_init_trav(&trav, kdt);
- c[2] = 0.0;
- idx[0] = 0;
- i = 0;
- while (kdtree_traverse(&trav, c, &uid)) {
- G_percent(i++, npoints, 4);
+ /* get epsilon */
+ if (dist_opt->answer) {
+ eps = atof(dist_opt->answer);
+ if (eps <= 0)
+ G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
+ }
+ else {
+ int n;
+ double dist, mean, min, max, sum, sumsq, sd;
+ double *kd;
+ int *ki;
- /* radius search */
- kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
+ /* estimate epsilon */
+ G_message(_("Estimating maximum distance ..."));
+ kdtree_init_trav(&trav, kdt);
+ c[2] = 0.0;
+ n = 0;
+ sum = sumsq = 0;
+ min = 1.0 / 0.0;
+ max = 0;
+ kd = G_malloc(minpnts * sizeof(double));
+ ki = G_malloc(minpnts * sizeof(int));
+ i = 0;
+ while (kdtree_traverse(&trav, c, &uid)) {
+ G_percent(i++, npoints, 4);
+
+ kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+ if (kdfound) {
+ dist = sqrt(kd[kdfound - 1]);
+ sum += dist;
+ sumsq += dist * dist;
+ n++;
+ if (min > dist)
+ min = dist;
+ if (max < dist)
+ max = dist;
+ }
+ }
+ G_percent(npoints, npoints, 4);
+
+ G_free(kd);
+ G_free(ki);
+
+ if (!n)
+ G_fatal_error(_("No neighbors found"));
+
+ mean = sum / n;
+ sd = sqrt(sumsq / n - mean * mean);
+ eps = mean + 1.644854 * sd; /* 90% CI */
+ eps = mean + 2.575829 * sd; /* 99% CI */
+
+ if (eps > max)
+ eps = max;
+
+ G_message(_("Distance to the %d nearest neighbor:"), minpnts);
+ G_message(_("Min: %g, max: %g"), min, max);
+ G_message(_("Mean: %g"), mean);
+ G_message(_("Standard deviation: %g"), sd);
+
+ G_message(_("Estimated maximum distance: %g"), eps);
+ }
+
+ /* create clusters */
+ G_message(_("Building clusters ..."));
+ clcnt = G_malloc((nlines + 1) * sizeof(int));
+ for (i = 0; i <= nlines; i++)
+ clcnt[i] = 0;
+ nclusters = 0;
+ kdtree_init_trav(&trav, kdt);
+ c[2] = 0.0;
+ idx[0] = 0;
+ i = 0;
+ while (kdtree_traverse(&trav, c, &uid)) {
+ G_percent(i++, npoints, 4);
+
+ /* radius search */
+ kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
+
+ /* any neighbor within radius */
+ if (kdfound > 0) {
+
+ cat = idx[cid[uid]];
+
+ /* find latest cluster */
+ for (j = 0; j < kdfound; j++) {
+ if (cat < idx[cid[kduid[j]]]) {
+ cat = idx[cid[kduid[j]]];
+ }
+ }
+
+ if (cat == 0) {
+ /* start new cluster */
+ nclusters++;
+ cat = nclusters;
+ if (nclusters > nlines)
+ G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
+ idx[nclusters] = nclusters;
+ cid[uid] = nclusters;
+ clcnt[cat] = 1;
+ }
+
+ /* set or update cluster ids */
+ if (cid[uid] != 0) {
+ /* relabel */
+ idx[cid[uid]] = cat;
+ }
+ else {
+ cid[uid] = cat;
+ clcnt[cat]++;
+ }
+
+ for (j = 0; j < kdfound; j++) {
+ if (cid[kduid[j]] != 0) {
+ /* relabel */
+ idx[cid[kduid[j]]] = cat;
+ }
+ else {
+ cid[kduid[j]] = cat;
+ clcnt[cat]++;
+ }
+ }
+ G_free(kddist);
+ G_free(kduid);
+ }
+ }
+ G_percent(npoints, npoints, 4);
+
+ if (nclusters == 0) {
+ G_message(_("No clusters found, adjust option %s"), dist_opt->key);
+ Vect_close(&In);
+ Vect_close(&Out);
+ Vect_delete(output->answer);
+ exit(EXIT_SUCCESS);
+ }
+
+ /* generate a renumbering scheme */
+ G_message(_("Generating renumbering scheme..."));
+ G_debug(1, "%d initial clusters", nclusters);
+ /* allocate final clump ID */
+ renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
+ renumber[0] = 0;
+ cat = 1;
+ G_percent(0, nclusters, 1);
+ for (i = 1; i <= nclusters; i++) {
+ G_percent(i, nclusters, 4);
+ OLD = i;
+ NEW = idx[i];
+ if (OLD != NEW) {
+ /* find valid clump ID */
+ while (OLD != NEW) {
+ OLD = NEW;
+ NEW = idx[OLD];
+ }
+ idx[i] = NEW;
+ clcnt[NEW] += clcnt[i];
+ }
+ }
+ for (i = 1; i <= nclusters; i++) {
+ OLD = i;
+ NEW = idx[i];
+ renumber[i] = 0;
+ if (OLD == NEW && clcnt[NEW] > minpnts) {
+ /* set final clump id */
+ renumber[i] = cat++;
+ }
+ }
+
+ nclusters = cat - 1;
+
+ /* write cluster ids */
+ G_message(_("Write out cluster ids ..."));
+ Vect_rewind(&In);
+ i = 0;
+ noutliers = 0;
+ while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+ G_percent(i++, nlines, 4);
+ if (type == GV_POINT) {
+ cat = renumber[idx[cid[i]]];
+ if (!cat)
+ noutliers++;
+ Vect_cat_set(Cats, clayer, cat);
+ Vect_write_line(&Out, GV_POINT, Points, Cats);
+ }
+ }
+ G_percent(nlines, nlines, 4);
+ }
+ else if (clmethod == CL_OPTICS) {
+ /* OPTICS
+ * each pair of points is either directly connected or
+ * connected by a chain of other points
+ * for each unprocessed point p
+ * mark as processed, append to output list
+ * core distance of p: distance to the k-th neighbor
+ * q: neighbor of p
+ * reachability of q: max(dist(p, q), coredist(p))
+ * -> needs epsilon, otherwise always coredist(p)
+ * for each unprocessed neighbor q
+ * if q has not been reached yet, put q in min heap
+ * if q's reachability can be reduced, put q with new reachability in min heap
+ * proceed with point with smallest reachability
+ * clusters:
+ * plot x = position in output list, y = reachability
+ * clusters = valleys of reachability in plot
+ * hierarchical clusters: valleys in valleys
+ */
+
+ double *kd;
+ int *ki;
+ int k, kdpnts;
+ int *clidx;
+ int *olist, nout;
+ double newrd;
+ int isout;
+
+ kd = G_malloc(minpnts * sizeof(double));
+ ki = G_malloc(minpnts * sizeof(int));
+
+ clp = G_malloc((npoints + 1) * sizeof(struct cl_pnt));
+ heapidx = G_malloc((npoints + 1) * sizeof(int));
+ olist = G_malloc((npoints + 1) * sizeof(int));
+ clidx = G_malloc((nlines + 1) * sizeof(int));
- /* must have min neighbors within radius */
- if (kdfound >= minpnts) {
+ heapsize = 0;
- cat = idx[cid[uid]];
+ /* get epsilon */
+ eps = 0;
+ if (dist_opt->answer) {
+ eps = atof(dist_opt->answer);
+ if (eps <= 0)
+ G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
+ }
- /* find latest cluster */
+ /* loading points */
+ G_message(_("Loading points ..."));
+ kdtree_init_trav(&trav, kdt);
+ c[2] = 0.0;
+ i = 0;
+ while (kdtree_traverse(&trav, c, &uid)) {
+ G_percent(i, npoints, 4);
+
+ clp[i].c[0] = c[0];
+ clp[i].c[1] = c[1];
+ clp[i].c[2] = c[2];
+ clp[i].uid = uid;
+ clp[i].cd = -1;
+ clp[i].reach = -1;
+ clp[i].prevpnt = -1;
+ clidx[uid] = i;
+ olist[i] = -1;
+
+ i++;
+ }
+ G_percent(npoints, npoints, 4);
+ kdpnts = i;
+ G_debug(0, "%d points in k-d tree", kdpnts);
+
+ /* reachability network */
+ G_message(_("Reachability network ..."));
+ nout = 0;
+ for (i = 0; i < kdpnts; i++) {
+ G_percent(i, kdpnts, 4);
+
+ if (clp[i].cd > 0)
+ continue;
+
+ /* knn search */
+ uid = clp[i].uid;
+ kdfound = kdtree_knn(kdt, clp[i].c, ki, kd, minpnts, &uid);
+ if (kdfound < minpnts)
+ G_fatal_error(_("Not enough points found"));
+
+ clp[i].cd = kd[minpnts - 1];
+ /* no reachability for the seed point !!! */
+ clp[i].reach = clp[i].cd; /* ok ? */
+ olist[nout++] = i;
+
+ /* initialize heap */
+ newrd = clp[i].cd;
for (j = 0; j < kdfound; j++) {
- if (idx[cid[kduid[j]]] > cat) {
- cat = idx[cid[kduid[j]]];
+ if (clp[clidx[ki[j]]].cd < 0) {
+ /* deviation from OPTICS,
+ * creates nicer connectivity graph */
+ newrd = kd[j];
+ if (clp[clidx[ki[j]]].reach < 0 || clp[clidx[ki[j]]].reach > newrd) {
+ clp[clidx[ki[j]]].reach = newrd;
+ clp[clidx[ki[j]]].prevpnt = i;
+ add_pt(clidx[ki[j]]);
+ }
}
}
- if (cat == 0) {
- /* start new cluster */
- nclusters++;
- cat = nclusters;
- if (nclusters > nlines)
- G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
- idx[nclusters] = nclusters;
- cid[uid] = nclusters;
+ while (heapsize) {
+ k = drop_pt();
+ if (k < 0 || k >= kdpnts)
+ G_fatal_error("Invalid index");
+ if (clp[k].cd > 0)
+ continue;
+
+ /* knn search */
+ uid = clp[k].uid;
+ kdfound = kdtree_knn(kdt, clp[k].c, ki, kd, minpnts, &uid);
+ if (kdfound < minpnts)
+ G_fatal_error(_("Not enough points found"));
+
+ clp[k].cd = kd[minpnts - 1];
+ olist[nout++] = k;
+
+ newrd = clp[k].cd;
+ for (j = 0; j < kdfound; j++) {
+ if (heapsize >= npoints)
+ G_fatal_error("Heap is too large");
+ if (clp[clidx[ki[j]]].cd < 0) {
+ /* deviation from OPTICS,
+ * creates nicer connectivity graph */
+ newrd = kd[j];
+ if (clp[clidx[ki[j]]].reach < 0 || clp[clidx[ki[j]]].reach > newrd) {
+ clp[clidx[ki[j]]].reach = newrd;
+ clp[clidx[ki[j]]].prevpnt = k;
+ add_pt(clidx[ki[j]]);
+ }
+ }
+ }
}
+ }
+ G_percent(kdpnts, kdpnts, 4);
+ G_debug(0, "nout: %d", nout);
+ if (nout != kdpnts)
+ G_fatal_error("nout != kdpnts");
- /* set or update cluster ids */
- if (cid[uid] != 0) {
- /* relabel */
- idx[cid[uid]] = cat;
- }
+ /* set cluster ids */
+ G_message(_("Set cluster ids ..."));
+ isout = 1;
+ nclusters = 0;
+ for (i = 0; i < kdpnts; i++) {
+ G_percent(i, kdpnts, 4);
+
+ if (eps > 0 && clp[olist[i]].reach > eps)
+ isout = 1;
else {
- cid[uid] = cat;
+ if (isout || clp[olist[i]].prevpnt == -1) {
+ isout = 0;
+ nclusters++;
+ }
+ cid[clp[olist[i]].uid] = nclusters;
}
+ }
+ G_percent(kdpnts, kdpnts, 4);
+ /* write cluster ids */
+ G_message(_("Write out cluster ids ..."));
+ Vect_rewind(&In);
+ i = 0;
+ noutliers = 0;
+ while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+ G_percent(i++, nlines, 4);
+ if (type == GV_POINT) {
+ cat = cid[i];
+ if (!cat)
+ noutliers++;
+ Vect_cat_set(Cats, clayer, cat);
+ Vect_write_line(&Out, GV_POINT, Points, Cats);
+ }
+ }
+ G_percent(nlines, nlines, 4);
+ }
+ else if (clmethod == CL_OPTICS2) {
+ /* OPTICS modified, create separated reachability networks
+ * for each point p
+ * get p's core distance
+ * get p's neighbors
+ * for each neighbor q
+ * new reachability of q: dist(p, q)
+ * if q has been processed
+ * new reachability of q: max(coredist(q), dist(p, q))
+ * set or reduce q's reachability
+ * connect q to p if q's reachability can be updated
+ */
+
+ double *coredist;
+ double *reachability;
+ int *nextpnt;
+ double *kd;
+ int *ki;
+ double newrd;
+
+ coredist = G_malloc((nlines + 1) * sizeof(double));
+ reachability = G_malloc((nlines + 1) * sizeof(double));
+ nextpnt = G_malloc((nlines + 1) * sizeof(int));
+
+ kd = G_malloc(minpnts * sizeof(double));
+ ki = G_malloc(minpnts * sizeof(int));
+
+ for (i = 0; i <= nlines; i++) {
+ coredist[i] = -1;
+ reachability[i] = -1;
+ nextpnt[i] = -1;
+ cid[i] = 0;
+ idx[i] = 0;
+ }
+
+ /* reachability network */
+ G_message(_("Reachability network ..."));
+ kdtree_init_trav(&trav, kdt);
+ c[2] = 0.0;
+ i = 0;
+ while (kdtree_traverse(&trav, c, &uid)) {
+ G_percent(i++, npoints, 4);
+
+ /* knn search */
+ kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+ if (kdfound < minpnts)
+ G_fatal_error(_("Not enough points found"));
+ coredist[uid] = kd[minpnts - 1];
+
for (j = 0; j < kdfound; j++) {
- if (cid[kduid[j]] != 0) {
- /* relabel */
- idx[cid[kduid[j]]] = cat;
+ /* new reachability */
+ newrd = kd[j];
+ if (coredist[ki[j]] > kd[j]) {
+ /* do not connect a point to its own cluster
+ * because points in its own cluster
+ * have already been connected to this point
+ * or reconnected */
+ newrd = coredist[ki[j]];
}
- else {
- cid[kduid[j]] = cat;
+
+ if (reachability[ki[j]] == -1 || reachability[ki[j]] > newrd) {
+ reachability[ki[j]] = newrd;
+ nextpnt[ki[j]] = uid;
+
+ /* no link - back link */
+ if (nextpnt[uid] == ki[j]) {
+ if (coredist[ki[j]] == -1) {
+ G_fatal_error(_("Neighbor point's core dist is -1"));
+ }
+ if (coredist[ki[j]] < coredist[uid]) {
+ nextpnt[ki[j]] = -1;
+ reachability[ki[j]] = -1;
+ nextpnt[uid] = ki[j];
+ }
+ else {
+ nextpnt[uid] = -1;
+ reachability[uid] = -1;
+ reachability[uid] = -1;
+ }
+ }
}
}
}
- if (kdfound) {
- G_free(kddist);
- G_free(kduid);
+ G_percent(npoints, npoints, 4);
+
+ /* create clusters from reachability network */
+ G_message(_("Building clusters ..."));
+ G_percent(0, nlines, 4);
+ for (i = 1; i <= nlines; i++) {
+
+ G_percent(i, nlines, 4);
+
+ if (cid[i] > 0 || coredist[i] == -1 || nextpnt[i] == -1)
+ continue;
+
+ if (cid[nextpnt[i]] > 0) {
+ cid[i] = idx[cid[nextpnt[i]]];
+ }
+ else {
+ /* start new cluster */
+ nclusters++;
+ cid[i] = nclusters;
+ idx[nclusters] = nclusters;
+ uid = nextpnt[i];
+
+ while (uid > 0) {
+ if (cid[uid] == 0) {
+ cid[uid] = nclusters;
+
+ uid = nextpnt[uid];
+ }
+ else {
+ /* relabel */
+ OLD = cid[uid];
+ NEW = idx[OLD];
+ while (OLD != NEW) {
+ OLD = NEW;
+ NEW = idx[OLD];
+ }
+ idx[NEW] = nclusters;
+ uid = nextpnt[uid];
+ uid = -1;
+ }
+ }
+ }
}
- }
- G_percent(npoints, npoints, 4);
- if (nclusters == 0) {
- G_message(_("No clusters found, adjust option %s"), dist_opt->key);
- Vect_close(&In);
- Vect_close(&Out);
- Vect_delete(output->answer);
- exit(EXIT_SUCCESS);
- }
+ /* generate a renumbering scheme */
+ G_message(_("Generating renumbering scheme..."));
+ G_debug(1, "%d initial clusters", nclusters);
+ /* allocate final clump ID */
+ renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
+ renumber[0] = 0;
+ cat = 1;
+ G_percent(0, nclusters, 1);
+ for (i = 1; i <= nclusters; i++) {
+ G_percent(i, nclusters, 4);
+ OLD = i;
+ NEW = idx[i];
+ if (OLD != NEW) {
+ renumber[i] = 0;
+ /* find valid clump ID */
+ while (OLD != NEW) {
+ OLD = NEW;
+ NEW = idx[OLD];
+ }
+ idx[i] = NEW;
+ }
+ else
+ /* set final clump id */
+ renumber[i] = cat++;
+ }
- /* generate a renumbering scheme */
- G_message(_("Generating renumbering scheme..."));
- G_debug(1, "%d initial clusters", nclusters);
- /* allocate final clump ID */
- renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
- renumber[0] = 0;
- cat = 1;
- G_percent(0, nclusters, 1);
- for (i = 1; i <= nclusters; i++) {
- G_percent(i, nclusters, 4);
- OLD = i;
- NEW = idx[i];
- if (OLD != NEW) {
- renumber[i] = 0;
- /* find valid clump ID */
- while (OLD != NEW) {
- OLD = NEW;
- NEW = idx[OLD];
+ nclusters = cat - 1;
+
+ /* write cluster ids */
+ G_message(_("Write out cluster ids ..."));
+ Vect_rewind(&In);
+ i = 0;
+ noutliers = 0;
+ while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+ G_percent(i++, nlines, 4);
+ if (type == GV_POINT) {
+ cat = renumber[idx[cid[i]]];
+ if (!cat)
+ noutliers++;
+ Vect_cat_set(Cats, clayer, cat);
+ Vect_write_line(&Out, GV_POINT, Points, Cats);
}
- idx[i] = NEW;
}
- else
- /* set final clump id */
- renumber[i] = cat++;
+ G_percent(nlines, nlines, 4);
}
+ else if (clmethod == CL_DENSE) {
+ /* MATRUSKA
+ * clusters in clusters in clusters ...
+ * calculate core density = distance to (minpnts - 1) for each point
+ * sort points ascending by core density
+ * for each point in sorted list
+ * if point does not have a cluster id
+ * start new cluster, cluster reachability is core density of this point
+ * connect all points within cluster reachability
+ * add all connected points to list
+ * while list is not empty
+ * remove last point from list
+ * connect all points within cluster reachability
+ * add all connected points to list
+ * */
+ int *clidx;
+ double *kd;
+ int *ki;
+ double cd;
+ int k, kdcount;
+ struct ilist *CList;
- nclusters = cat - 1;
+ clp = G_malloc((nlines + 1) * sizeof(struct cl_pnt));
+ clidx = G_malloc((nlines + 1) * sizeof(int));
- /* write cluster ids */
- G_message(_("Write out cluster ids ..."));
- Vect_rewind(&In);
- i = 0;
- while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
- G_percent(i++, nlines, 4);
- if (type == GV_POINT) {
- cat = renumber[idx[cid[i]]];
- Vect_cat_set(Cats, clayer, cat);
- Vect_write_line(&Out, GV_POINT, Points, Cats);
+ kd = G_malloc(minpnts * sizeof(double));
+ ki = G_malloc(minpnts * sizeof(int));
+
+ CList = G_new_ilist();
+
+ for (i = 0; i <= nlines; i++) {
+ clp[i].c[0] = 0;
+ clp[i].c[1] = 0;
+ clp[i].c[2] = 0;
+ clp[i].cd = -1;
+ clp[i].uid = -1;
+ clidx[i] = -1;
}
+
+ /* core density */
+ G_message(_("Core density ..."));
+ kdtree_init_trav(&trav, kdt);
+ c[2] = 0.0;
+ i = 0;
+ kdcount = 0;
+ uid = -1;
+ while (kdtree_traverse(&trav, c, &uid)) {
+ G_percent(i++, npoints, 4);
+
+ /* knn search */
+ kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+ if (kdfound < minpnts)
+ G_fatal_error(_("Not enough points found"));
+
+ cd = kd[minpnts - 1];
+
+ /* list insert */
+ for (j = kdcount; j > 0; j--) {
+ if (clp[j - 1].cd <= cd)
+ break;
+ clp[j] = clp[j - 1];
+ clidx[clp[j].uid] = j;
+ }
+ clp[j].uid = uid;
+ clp[j].c[0] = c[0];
+ clp[j].c[1] = c[1];
+ clp[j].c[2] = c[2];
+ clp[j].cd = cd;
+ clidx[clp[j].uid] = j;
+ kdcount++;
+
+ }
+ G_percent(npoints, npoints, 4);
+
+ /* create clusters */
+ G_message(_("Building clusters ..."));
+ nclusters = 0;
+ for (i = 0; i < kdcount; i++) {
+ G_percent(i, kdcount, 4);
+
+ if (cid[clp[i].uid] > 0)
+ continue;
+
+ /* knn search */
+ kdfound = kdtree_knn(kdt, clp[i].c, ki, kd, minpnts, &clp[i].uid);
+ if (kdfound < minpnts)
+ G_fatal_error(_("Not enough points found"));
+
+ /* start a new cluster */
+ uid = clp[i].uid;
+ nclusters++;
+ cat = nclusters;
+ cid[uid] = cat;
+ cd = clp[i].cd;
+ CList->n_values = 0;
+ for (j = 0; j < kdfound; j++) {
+ if (cid[ki[j]] == 0) {
+ G_ilist_add(CList, clidx[ki[j]]);
+ cid[ki[j]] = cat;
+ }
+ }
+ if (CList->n_values < minpnts) {
+ CList->n_values = 0;
+ nclusters--;
+ cid[uid] = 0;
+ for (j = 0; j < kdfound; j++) {
+ if (cid[ki[j]] == cat) {
+ cid[ki[j]] = 0;
+ }
+ }
+ }
+
+ while (CList->n_values) {
+ /* expand cluster */
+ CList->n_values--;
+ k = CList->value[CList->n_values];
+ if (k < 0)
+ G_fatal_error("expand cluster: k < 0");
+ if (clp[k].uid < 1)
+ G_fatal_error("expand cluster: clp[k].uid < 1");
+
+ kdfound = kdtree_knn(kdt, clp[k].c, ki, kd, minpnts, &clp[k].uid);
+ if (kdfound < minpnts)
+ G_fatal_error(_("Not enough points found"));
+
+ for (j = 0; j < kdfound; j++) {
+ if (kd[j] <= cd && cid[ki[j]] == 0) {
+ cid[ki[j]] = cat;
+ if (clidx[ki[j]] < 0)
+ G_fatal_error("expand cluster ngbrs: clidx[ki[j]] < 0");
+ G_ilist_add(CList, clidx[ki[j]]);
+ }
+ }
+ }
+ }
+ G_percent(kdcount, kdcount, 4);
+
+ /* write cluster ids */
+ G_message(_("Write out cluster ids ..."));
+ Vect_rewind(&In);
+ i = 0;
+ noutliers = 0;
+ while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+ G_percent(i++, nlines, 4);
+ if (type == GV_POINT) {
+ cat = cid[i];
+ if (!cat)
+ noutliers++;
+ Vect_cat_set(Cats, clayer, cat);
+ Vect_write_line(&Out, GV_POINT, Points, Cats);
+ }
+ }
+ G_percent(nlines, nlines, 4);
}
- G_percent(nlines, nlines, 4);
if (!flag_attr->answer)
Vect_copy_tables(&In, &Out, 0);
@@ -359,6 +1124,110 @@
Vect_close(&Out);
G_message(_("%d clusters found"), nclusters);
+ G_message(_("%d outliers found"), noutliers);
exit(EXIT_SUCCESS);
}
+
+/* min heap */
+
+/* compare heap points */
+int cmp_pnt(int a, int b)
+{
+ if (clp[a].reach < clp[b].reach)
+ return 1;
+ if (clp[a].reach > clp[b].reach)
+ return 0;
+ if (clp[a].uid < clp[b].uid)
+ return 1;
+ return 0;
+}
+
+/* standard sift-up routine for d-ary min heap */
+static int sift_up(int start)
+{
+ register int parent, child, child_added;
+
+ child = start;
+ child_added = heapidx[child];
+
+ while (child > 1) {
+ GET_PARENT(parent, child);
+
+ /* child smaller */
+ if (cmp_pnt(child_added, heapidx[parent])) {
+ /* push parent point down */
+ heapidx[child] = heapidx[parent];
+ child = parent;
+ }
+ else
+ /* no more sifting up, found new slot for child */
+ break;
+ }
+
+ /* put point in new slot */
+ if (child < start) {
+ heapidx[child] = child_added;
+ }
+
+ return 0;
+}
+
+/* add point routine for min heap */
+int add_pt(int idx)
+{
+ /* add point to next free position */
+ heapsize++;
+
+ heapidx[heapsize] = idx;
+
+ /* sift up: move new point towards top of heap */
+ sift_up(heapsize);
+
+ return 0;
+}
+
+/* drop point routine for min heap */
+int drop_pt(void)
+{
+ register int child, childr, parent;
+ register int i, idx;
+
+ idx = heapidx[1];
+ if (heapsize == 1) {
+ heapidx[1] = -1;
+ heapsize = 0;
+ return idx;
+ }
+
+ /* start with root */
+ parent = 1;
+
+ /* sift down: move hole back towards bottom of heap */
+ while (GET_CHILD(child, parent) <= heapsize) {
+
+ i = child + 3;
+ for (childr = child + 1; childr <= heapsize && childr < i; childr++) {
+ if (cmp_pnt(heapidx[childr], heapidx[child])) {
+ child = childr;
+ }
+ }
+
+ /* move hole down */
+ heapidx[parent] = heapidx[child];
+ parent = child;
+ }
+
+ /* hole is in lowest layer, move to heap end */
+ if (parent < heapsize) {
+ heapidx[parent] = heapidx[heapsize];
+
+ /* sift up last swapped point, only necessary if hole moved to heap end */
+ sift_up(parent);
+ }
+
+ /* the actual drop */
+ heapsize--;
+
+ return idx;
+}
Modified: grass/trunk/vector/v.cluster/v.cluster.html
===================================================================
--- grass/trunk/vector/v.cluster/v.cluster.html 2015-01-08 22:05:12 UTC (rev 64016)
+++ grass/trunk/vector/v.cluster/v.cluster.html 2015-01-08 22:28:02 UTC (rev 64017)
@@ -1,30 +1,108 @@
<h2>DESCRIPTION</h2>
<em>v.cluster</em> partitions a point cloud into clusters or clumps.
-A point can only be in a cluster if the maximum distance to its <i>min</i>
-neighbors is smaller than distance. This algoritm is known as
-<a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN</a>.
<p>
-If the minimum number of points is not given with the <i>min</i> option,
-the minimum number of points to consitute a cluster is <i>number of dimensions + 1</i>,
-i.e. 3 for 2D points and 4 for 3d points.
+If the minimum number of points is not specified with the <i>min</i>
+option, the minimum number of points to constitute a cluster is
+<i>number of dimensions + 1</i>, i.e. 3 for 2D points and 4 for 3D
+points.
+<p>
+If the maximum distance is not specified with the <i>distance</i>
+option, the maximum distance is estimated from the observed distances
+to the neighbors using the upper 99% confidence interval.
+<p>
+<em>v.cluster</em> supports different methods for clustering. The
+recommended methods are <i>method=dbscan</i> if all clusters should
+have a density (maximum distance between points) not larger than
+<i>distance</i> or <i>method=density</i> if clusters should be created
+separately for each observed density (distance to the farthest neighbor).
+
+<h4>dbscan</h4>
+The <a href="http://en.wikipedia.org/wiki/DBSCAN">Density-Based Spatial
+Clustering of Applications with Noise</a> is a commonly used clustering
+algorithm. A new cluster is started for a point with at least
+<i>min</i> - 1 neighbors within the maximum distance. These neighbors
+are added to the cluster. The cluster is then expanded as long as at
+least <i>min</i> - 1 neighbors are within the maximum distance for each
+point already in the cluster.
+
+<h4>dbscan2</h4>
+Similar to <i>dbscan</i>, but here it is sufficient if the resultant
+cluster consists of at least <i>min</i> points, even if no point in the
+cluster has at least <i>min</i> -1 neighbors within <i>distance</i>.
+
+<h4>density</h4>
+This method creates clusters according to their point density. The
+maximum distance is not used. Instead, the points are sorted ascending
+by the distance to their farthest neighbor (core distance), inspecting
+<i>min</i> - 1 neighbors. The densest cluster is created first, using
+as threshold the core distance of the seed point. The cluster is
+expanded as for DBSCAN, with the difference that each cluster has its
+own maximum distance. This method can identify clusters with different
+densities and can create nested clusters.
+
+<h4>optics</h4>
+This method is <a
+href="http://en.wikipedia.org/wiki/OPTICS_algorithm">Ordering Points to
+Identify the Clustering Structure</a>. It is controlled by the number
+of neighbor points (option <i>min</i> - 1). The core distance of a
+point is the distance to the farthest neighbor. The reachability of a
+point <i>q</i> is its distance from a point <i>p</i> (original optics:
+max(core-distance(p), distance(p, q))). The aim of the <i>optics</i>
+method is to reduce the reachability of each point. Each unprocessed
+point is the seed for a new cluster. Its neighbors are added to a queue
+sorted by smallest reachability if their reachability can be reduced.
+The points in the queue are processed and their unprocessed neighbors
+are added to a queue sorted by smallest reachability if their
+reachability can be reduced.
+
+<p>
+The <i>optics</i> method does not create clusters itself, but produces
+an ordered list of the points together with their reachability. The
+output list is ordered according to the order of processing: the first
+point processed is the first in the list, the last point processed is
+the last in the list. Clusters can be extracted from this list by
+identifying valleys in the points' reachability, e.g. by using a
+threshold value. If a maximum distance is specified, this is used to
+identify clusters, otherwise each separated network will constitute a
+cluster.
+
+<p>
+The OPTICS algorithm uses each yet unprocessed point to start a new
+cluster. The order of the input points is arbitrary and can thus
+influence the resultant clusters.
+
+<h4>optics2</h4>
+<b>EXPERIMENTAL</b> This method is similar to OPTICS, minimizing the
+reachability of each point. Points are reconnected if their
+reachability can be reduced. Contrary to OPTICS, a cluster's seed is
+not fixed but changed if possible. Each point is connected to another
+point until the core of the cluster (seed point) is reached.
+Effectively, the initial seed is updated in the process. Thus separated
+networks of points are created, with each network representing a
+cluster. The maximum distance is not used.
+
<h2>EXAMPLE</h2>
-Analysis of random points for areas in the vector <i>urbanarea</i> in the
-North Carolina sample dataset:
+Analysis of random points for areas in areas of the vector
+<i>urbanarea</i> (North Carolina sample dataset).
+<p>
+10000 random points within the areas the vector urbanarea and within the
+subregion:
+
<div class="code"><pre>
-# pick a subregion of he vector urbanarea
+# pick a subregion of the vector urbanarea
g.region -p n=272950 s=188330 w=574720 e=703090 res=10
# create clustered points
-v.random output=rand_clust npoints=1000000 restrict=urbanarea at PERMANENT
+v.random output=rand_clust npoints=10000 restrict=urbanarea at PERMANENT
# identify clusters
-v.cluster in=rand_clust out=rand_clusters
+v.cluster in=rand_clust out=rand_clusters method=dbscan
# create colors for clusters
v.db.addtable map=rand_clusters layer=2 columns="cat integer,grassrgb varchar(11)"
@@ -33,10 +111,28 @@
# display with your preferred method
</pre></div>
-<h2>TODO</h2>
+<p>
+100 random points for each area in the vector urbanarea and within the
+subregion:
-Implement <a href="http://en.wikipedia.org/wiki/OPTICS_algorithm">OPTICS</a>
+<div class="code"><pre>
+# pick a subregion of the vector urbanarea
+g.region -p n=272950 s=188330 w=574720 e=703090 res=10
+# create clustered points
+v.random output=rand_clust npoints=100 restrict=urbanarea at PERMANENT -a
+
+# identify clusters
+v.cluster in=rand_clust out=rand_clusters method=density
+
+# create colors for clusters
+v.db.addtable map=rand_clusters layer=2 columns="cat integer,grassrgb varchar(11)"
+v.colors map=rand_clusters layer=2 use=cat color=random rgb_column=grassrgb
+
+# display with your preferred method
+</pre></div>
+
+
<h2>SEE ALSO</h2>
<em>
More information about the grass-commit
mailing list