[GRASS-SVN] r64017 - grass/trunk/vector/v.cluster

svn_grass at osgeo.org svn_grass at osgeo.org
Thu Jan 8 14:28:02 PST 2015


Author: mmetz
Date: 2015-01-08 14:28:02 -0800 (Thu, 08 Jan 2015)
New Revision: 64017

Modified:
   grass/trunk/vector/v.cluster/main.c
   grass/trunk/vector/v.cluster/v.cluster.html
Log:
v.cluster: add methods dbscan2,density,optics,optics2

Modified: grass/trunk/vector/v.cluster/main.c
===================================================================
--- grass/trunk/vector/v.cluster/main.c	2015-01-08 22:05:12 UTC (rev 64016)
+++ grass/trunk/vector/v.cluster/main.c	2015-01-08 22:28:02 UTC (rev 64017)
@@ -18,30 +18,65 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <grass/gis.h>
 #include <grass/vector.h>
 #include <grass/glocale.h>
 #include <grass/kdtree.h>
 
+#ifdef MAX
+#undef MAX
+#endif
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define CL_DBSCAN	1
+#define CL_DBSCAN2	2
+#define CL_DENSE	3
+#define CL_OPTICS	4
+#define CL_OPTICS2	5
+
+#define GET_PARENT(p, c) ((p) = (int) (((c) - 2) / 3 + 1))
+#define GET_CHILD(c, p) ((c) = (int) (((p) * 3) - 1))
+
+
+struct cl_pnt
+{
+    int uid;
+    int prevpnt;
+    double cd;
+    double reach;
+    double c[3];
+};
+
+static struct cl_pnt *clp;
+
+static int *heapidx;
+static int heapsize;
+
+int add_pt(int idx);
+int drop_pt(void);
+
 int main(int argc, char *argv[])
 {
     struct Map_info In, Out;
-    static struct line_pnts *Points;
+    struct line_pnts *Points;
     struct line_cats *Cats;
     int i, j, type, cat, is3d;
     struct GModule *module;
-    struct Option *input, *output, *dist_opt, *min_opt, *lyr_opt;
+    struct Option *input, *output, *lyr_opt;
+    struct Option *dist_opt, *min_opt, *method_opt;
     struct Flag *flag_2d, *flag_topo, *flag_attr;
     int clayer;
     int npoints, nlines;
     int *cid, *idx, *renumber, OLD, NEW;
-    int nclusters;
+    int nclusters, noutliers;
     struct kdtree *kdt;
     struct kdtrav trav;
     double c[3];
     int uid;
     double eps;
     int ndims, minpnts;
+    int clmethod;
     double *kddist;
     int kdfound, *kduid;
 
@@ -78,6 +113,14 @@
     min_opt->required = NO;
     min_opt->label = _("Minimum number of points to create a cluster");
 
+    method_opt = G_define_option();
+    method_opt->type = TYPE_STRING;
+    method_opt->key = "method";
+    method_opt->options = "dbscan,dbscan2,density,optics,optics2";
+    method_opt->answer = "dbscan";
+    method_opt->required = NO;
+    method_opt->label = _("Clustering method");
+
     flag_2d = G_define_flag();
     flag_2d->key = '2';
     flag_2d->label = _("Force 2D clustering");
@@ -125,18 +168,34 @@
     }
 
     clayer = atoi(lyr_opt->answer);
+    if (clayer < 1)
+	G_fatal_error(_("Option %s must be positive"), lyr_opt->key);
 
+    clmethod = CL_DBSCAN;
+    if (!strcmp(method_opt->answer, "dbscan2"))
+	clmethod = CL_DBSCAN2;
+    else if (!strcmp(method_opt->answer, "density"))
+	clmethod = CL_DENSE;
+    else if (!strcmp(method_opt->answer, "optics"))
+	clmethod = CL_OPTICS;
+    else if (!strcmp(method_opt->answer, "optic2"))
+	clmethod = CL_OPTICS2;
+
     /* count points */
     G_message(_("Counting input points ..."));
     npoints = nlines = 0;
     while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
 	nlines++;
-	if (type == GV_POINT)
+	if (type == GV_POINT) {
+	    if (Vect_cat_get(Cats, clayer, &cat))
+		G_fatal_error(_("Layer %d is not empty, choose another layer"),
+		              clayer);
 	    npoints++;
+	}
     }
 
-    if (npoints == 0) {
-	G_warning(_("No points in input, nothing to do"));
+    if (npoints < minpnts + 1) {
+	G_warning(_("Not enough points in input, nothing to do"));
 	Vect_close(&In);
 	exit(EXIT_SUCCESS);
     }
@@ -160,6 +219,7 @@
     Vect_rewind(&In);
     i = 0;
     cid[0] = 0;
+    idx[0] = 0;
     while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
 	G_percent(i++, nlines, 4);
 	cid[i] = 0;
@@ -176,178 +236,883 @@
 
     kdtree_optimize(kdt, 2);
 
-    /* get epsilon */
-    if (dist_opt->answer) {
-	eps = atof(dist_opt->answer);
-	if (eps <= 0)
-	    G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
-    }
-    else {
-	int n;
-	double dist, mean, min, max, sum, sumsq, sd;
-	double *kd;
-	int *ki;
+    noutliers = nclusters = 0;
+    if (clmethod == CL_DBSCAN) {
+	/* DBSCAN 
+	 * the neighbors of each point with at least minpnts neighbors 
+	 * are added to a cluster */
 
-	/* estimate epsilon */
-	G_message(_("Estimating maximum distance ..."));
+	/* get epsilon */
+	if (dist_opt->answer) {
+	    eps = atof(dist_opt->answer);
+	    if (eps <= 0)
+		G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
+	}
+	else {
+	    int n;
+	    double dist, mean, min, max, sum, sumsq, sd;
+	    double *kd;
+	    int *ki;
+
+	    /* estimate epsilon */
+	    G_message(_("Estimating maximum distance ..."));
+	    kdtree_init_trav(&trav, kdt);
+	    c[2] = 0.0;
+	    n = 0;
+	    sum = sumsq = 0;
+	    min = 1.0 / 0.0;
+	    max = 0;
+	    kd = G_malloc(minpnts * sizeof(double));
+	    ki = G_malloc(minpnts * sizeof(int));
+	    i = 0;
+	    while (kdtree_traverse(&trav, c, &uid)) {
+		G_percent(i++, npoints, 4);
+
+		kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+		if (kdfound) {
+		    dist = sqrt(kd[kdfound - 1]);
+		    sum += dist;
+		    sumsq += dist * dist;
+		    n++;
+		    if (min > dist)
+			min = dist;
+		    if (max < dist)
+			max = dist;
+		}
+	    }
+	    G_percent(npoints, npoints, 4);
+
+	    G_free(kd);
+	    G_free(ki);
+
+	    if (!n)
+		G_fatal_error(_("No neighbors found"));
+	    
+	    mean = sum / n;
+	    sd = sqrt(sumsq / n - mean * mean);
+	    eps = mean + 1.644854 * sd; /* 90% CI */
+	    eps = mean + 2.575829 * sd; /* 99% CI */
+	    
+	    if (eps > max)
+		eps = max;
+
+	    G_message(_("Distance to the %d nearest neighbor:"), minpnts);
+	    G_message(_("Min: %g, max: %g"), min, max);
+	    G_message(_("Mean: %g"), mean);
+	    G_message(_("Standard deviation: %g"), sd);
+
+	    G_message(_("Estimated maximum distance: %g"), eps);
+	}
+
+	/* create clusters */
+	G_message(_("Building clusters ..."));
+	nclusters = 0;
 	kdtree_init_trav(&trav, kdt);
 	c[2] = 0.0;
-	n = 0;
-	sum = sumsq = 0;
-	min = 1.0 / 0.0;
-	max = 0;
-	kd = G_malloc(minpnts * sizeof(double));
-	ki = G_malloc(minpnts * sizeof(int));
+	idx[0] = 0;
 	i = 0;
 	while (kdtree_traverse(&trav, c, &uid)) {
 	    G_percent(i++, npoints, 4);
 
-	    kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+	    /* radius search */
+	    kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
+	    
+	    /* must have min neighbors within radius */
+	    if (kdfound >= minpnts) {
+
+		cat = idx[cid[uid]];
+
+		/* find latest cluster */
+		for (j = 0; j < kdfound; j++) {
+		    if (cat < idx[cid[kduid[j]]]) {
+			cat = idx[cid[kduid[j]]];
+		    }
+		}
+
+		if (cat == 0) {
+		    /* start new cluster */
+		    nclusters++;
+		    cat = nclusters;
+		    if (nclusters > nlines)
+			G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
+		    idx[nclusters] = nclusters;
+		    cid[uid] = nclusters;
+		}
+
+		/* set or update cluster ids */
+		if (cid[uid] != 0) {
+		    /* relabel */
+		    idx[cid[uid]] = cat;
+		}
+		else {
+		    cid[uid] = cat;
+		}
+
+		for (j = 0; j < kdfound; j++) {
+		    if (cid[kduid[j]] != 0) {
+			/* relabel */
+			idx[cid[kduid[j]]] = cat;
+		    }
+		    else {
+		       cid[kduid[j]] = cat;
+		    }
+		}
+	    }
 	    if (kdfound) {
-		dist = sqrt(kd[kdfound - 1]);
-		sum += dist;
-		sumsq += dist * dist;
-		n++;
-		if (min > dist)
-		    min = dist;
-		if (max < dist)
-		    max = dist;
+		G_free(kddist);
+		G_free(kduid);
 	    }
 	}
 	G_percent(npoints, npoints, 4);
 
-	G_free(kd);
-	G_free(ki);
+	if (nclusters == 0) {
+	    G_message(_("No clusters found, adjust option %s"), dist_opt->key);
+	    Vect_close(&In);
+	    Vect_close(&Out);
+	    Vect_delete(output->answer);
+	    exit(EXIT_SUCCESS);
+	}
 
-	if (!n)
-	    G_fatal_error(_("No neighbors found"));
-	
-	mean = sum / n;
-	sd = sqrt(sumsq / n - mean * mean);
-	eps = mean + 1.644854 * sd; /* 90% CI */
-	eps = mean + 2.575829 * sd; /* 99% CI */
-	
-	if (eps > max)
-	    eps = max;
+	/* generate a renumbering scheme */
+	G_message(_("Generating renumbering scheme..."));
+	G_debug(1, "%d initial clusters", nclusters);
+	/* allocate final clump ID */
+	renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
+	renumber[0] = 0;
+	cat = 1;
+	G_percent(0, nclusters, 1);
+	for (i = 1; i <= nclusters; i++) {
+	    G_percent(i, nclusters, 4);
+	    OLD = i;
+	    NEW = idx[i];
+	    if (OLD != NEW) {
+		renumber[i] = 0;
+		/* find valid clump ID */
+		while (OLD != NEW) {
+		    OLD = NEW;
+		    NEW = idx[OLD];
+		}
+		idx[i] = NEW;
+	    }
+	    else
+		/* set final clump id */
+		renumber[i] = cat++;
+	}
 
-	G_message(_("Distance to the %d nearest neighbor:"), minpnts);
-	G_message(_("Min: %g, max: %g"), min, max);
-	G_message(_("Mean: %g"), mean);
-	G_message(_("Standard deviation: %g"), sd);
+	nclusters = cat - 1;
 
-	G_message(_("Estimated maximum distance: %g"), eps);
+	/* write cluster ids */
+	G_message(_("Write out cluster ids ..."));
+	Vect_rewind(&In);
+	i = 0;
+	noutliers = 0;
+	while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+	    G_percent(i++, nlines, 4);
+	    if (type == GV_POINT) {
+		cat = renumber[idx[cid[i]]];
+		if (!cat)
+		    noutliers++;
+		Vect_cat_set(Cats, clayer, cat);
+		Vect_write_line(&Out, GV_POINT, Points, Cats);
+	    }
+	}
+	G_percent(nlines, nlines, 4);
     }
+    else if (clmethod == CL_DBSCAN2) {
+	/* DBSCAN, but cluster size must be at least minpnts + 1 */
+	int *clcnt;
 
-    /* create clusters */
-    G_message(_("Building clusters ..."));
-    nclusters = 0;
-    kdtree_init_trav(&trav, kdt);
-    c[2] = 0.0;
-    idx[0] = 0;
-    i = 0;
-    while (kdtree_traverse(&trav, c, &uid)) {
-	G_percent(i++, npoints, 4);
+	/* get epsilon */
+	if (dist_opt->answer) {
+	    eps = atof(dist_opt->answer);
+	    if (eps <= 0)
+		G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
+	}
+	else {
+	    int n;
+	    double dist, mean, min, max, sum, sumsq, sd;
+	    double *kd;
+	    int *ki;
 
-	/* radius search */
-	kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
+	    /* estimate epsilon */
+	    G_message(_("Estimating maximum distance ..."));
+	    kdtree_init_trav(&trav, kdt);
+	    c[2] = 0.0;
+	    n = 0;
+	    sum = sumsq = 0;
+	    min = 1.0 / 0.0;
+	    max = 0;
+	    kd = G_malloc(minpnts * sizeof(double));
+	    ki = G_malloc(minpnts * sizeof(int));
+	    i = 0;
+	    while (kdtree_traverse(&trav, c, &uid)) {
+		G_percent(i++, npoints, 4);
+
+		kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+		if (kdfound) {
+		    dist = sqrt(kd[kdfound - 1]);
+		    sum += dist;
+		    sumsq += dist * dist;
+		    n++;
+		    if (min > dist)
+			min = dist;
+		    if (max < dist)
+			max = dist;
+		}
+	    }
+	    G_percent(npoints, npoints, 4);
+
+	    G_free(kd);
+	    G_free(ki);
+
+	    if (!n)
+		G_fatal_error(_("No neighbors found"));
+	    
+	    mean = sum / n;
+	    sd = sqrt(sumsq / n - mean * mean);
+	    eps = mean + 1.644854 * sd; /* 90% CI */
+	    eps = mean + 2.575829 * sd; /* 99% CI */
+	    
+	    if (eps > max)
+		eps = max;
+
+	    G_message(_("Distance to the %d nearest neighbor:"), minpnts);
+	    G_message(_("Min: %g, max: %g"), min, max);
+	    G_message(_("Mean: %g"), mean);
+	    G_message(_("Standard deviation: %g"), sd);
+
+	    G_message(_("Estimated maximum distance: %g"), eps);
+	}
+
+	/* create clusters */
+	G_message(_("Building clusters ..."));
+	clcnt = G_malloc((nlines + 1) * sizeof(int));
+	for (i = 0; i <= nlines; i++)
+	    clcnt[i] = 0;
+	nclusters = 0;
+	kdtree_init_trav(&trav, kdt);
+	c[2] = 0.0;
+	idx[0] = 0;
+	i = 0;
+	while (kdtree_traverse(&trav, c, &uid)) {
+	    G_percent(i++, npoints, 4);
+
+	    /* radius search */
+	    kdfound = kdtree_dnn(kdt, c, &kduid, &kddist, eps, &uid);
+	    
+	    /* any neighbor within radius */
+	    if (kdfound > 0) {
+
+		cat = idx[cid[uid]];
+
+		/* find latest cluster */
+		for (j = 0; j < kdfound; j++) {
+		    if (cat < idx[cid[kduid[j]]]) {
+			cat = idx[cid[kduid[j]]];
+		    }
+		}
+
+		if (cat == 0) {
+		    /* start new cluster */
+		    nclusters++;
+		    cat = nclusters;
+		    if (nclusters > nlines)
+			G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
+		    idx[nclusters] = nclusters;
+		    cid[uid] = nclusters;
+		    clcnt[cat] = 1;
+		}
+
+		/* set or update cluster ids */
+		if (cid[uid] != 0) {
+		    /* relabel */
+		    idx[cid[uid]] = cat;
+		}
+		else {
+		    cid[uid] = cat;
+		    clcnt[cat]++;
+		}
+
+		for (j = 0; j < kdfound; j++) {
+		    if (cid[kduid[j]] != 0) {
+			/* relabel */
+			idx[cid[kduid[j]]] = cat;
+		    }
+		    else {
+		        cid[kduid[j]] = cat;
+			clcnt[cat]++;
+		    }
+		}
+		G_free(kddist);
+		G_free(kduid);
+	    }
+	}
+	G_percent(npoints, npoints, 4);
+
+	if (nclusters == 0) {
+	    G_message(_("No clusters found, adjust option %s"), dist_opt->key);
+	    Vect_close(&In);
+	    Vect_close(&Out);
+	    Vect_delete(output->answer);
+	    exit(EXIT_SUCCESS);
+	}
+
+	/* generate a renumbering scheme */
+	G_message(_("Generating renumbering scheme..."));
+	G_debug(1, "%d initial clusters", nclusters);
+	/* allocate final clump ID */
+	renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
+	renumber[0] = 0;
+	cat = 1;
+	G_percent(0, nclusters, 1);
+	for (i = 1; i <= nclusters; i++) {
+	    G_percent(i, nclusters, 4);
+	    OLD = i;
+	    NEW = idx[i];
+	    if (OLD != NEW) {
+		/* find valid clump ID */
+		while (OLD != NEW) {
+		    OLD = NEW;
+		    NEW = idx[OLD];
+		}
+		idx[i] = NEW;
+		clcnt[NEW] += clcnt[i];
+	    }
+	}
+	for (i = 1; i <= nclusters; i++) {
+	    OLD = i;
+	    NEW = idx[i];
+	    renumber[i] = 0;
+	    if (OLD == NEW && clcnt[NEW] > minpnts) {
+		/* set final clump id */
+		renumber[i] = cat++;
+	    }
+	}
+
+	nclusters = cat - 1;
+
+	/* write cluster ids */
+	G_message(_("Write out cluster ids ..."));
+	Vect_rewind(&In);
+	i = 0;
+	noutliers = 0;
+	while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+	    G_percent(i++, nlines, 4);
+	    if (type == GV_POINT) {
+		cat = renumber[idx[cid[i]]];
+		if (!cat)
+		    noutliers++;
+		Vect_cat_set(Cats, clayer, cat);
+		Vect_write_line(&Out, GV_POINT, Points, Cats);
+	    }
+	}
+	G_percent(nlines, nlines, 4);
+    }
+    else if (clmethod == CL_OPTICS) {
+	/* OPTICS
+	 * each pair of points is either directly connected or 
+	 * connected by a chain of other points 
+	 * for each unprocessed point p
+	 * mark as processed, append to output list
+	 * core distance of p: distance to the k-th neighbor
+	 * q: neighbor of p
+	 * reachability of q: max(dist(p, q), coredist(p))
+	 * -> needs epsilon, otherwise always coredist(p)
+	 * for each unprocessed neighbor q
+	 * if q has not been reached yet, put q in min heap
+	 * if q's reachability can be reduced, put q with new reachability in min heap
+	 * proceed with point with smallest reachability
+	 * clusters:
+	 * plot x = position in output list, y = reachability
+	 * clusters = valleys of reachability in plot
+	 * hierarchical clusters: valleys in valleys
+	 */
+
+	double *kd;
+	int *ki;
+	int k, kdpnts;
+	int *clidx;
+	int *olist, nout;
+	double newrd;
+	int isout;
+
+	kd = G_malloc(minpnts * sizeof(double));
+	ki = G_malloc(minpnts * sizeof(int));
+
+	clp = G_malloc((npoints + 1) * sizeof(struct cl_pnt));
+	heapidx = G_malloc((npoints + 1) * sizeof(int));
+	olist = G_malloc((npoints + 1) * sizeof(int));
+	clidx = G_malloc((nlines + 1) * sizeof(int));
 	
-	/* must have min neighbors within radius */
-	if (kdfound >= minpnts) {
+	heapsize = 0;
 
-	    cat = idx[cid[uid]];
+	/* get epsilon */
+	eps = 0;
+	if (dist_opt->answer) {
+	    eps = atof(dist_opt->answer);
+	    if (eps <= 0)
+		G_fatal_error(_("Option %s must be a positive number"), dist_opt->key);
+	}
 
-	    /* find latest cluster */
+	/* loading points */
+	G_message(_("Loading points ..."));
+	kdtree_init_trav(&trav, kdt);
+	c[2] = 0.0;
+	i = 0;
+	while (kdtree_traverse(&trav, c, &uid)) {
+	    G_percent(i, npoints, 4);
+	    
+	    clp[i].c[0] = c[0];
+	    clp[i].c[1] = c[1];
+	    clp[i].c[2] = c[2];
+	    clp[i].uid = uid;
+	    clp[i].cd = -1;
+	    clp[i].reach = -1;
+	    clp[i].prevpnt = -1;
+	    clidx[uid] = i;
+	    olist[i] = -1;
+	    
+	    i++;
+	}
+	G_percent(npoints, npoints, 4);
+	kdpnts = i;
+	G_debug(0, "%d points in k-d tree", kdpnts);
+
+	/* reachability network */
+	G_message(_("Reachability network ..."));
+	nout = 0;
+	for (i = 0; i < kdpnts; i++) {
+	    G_percent(i, kdpnts, 4);
+	    
+	    if (clp[i].cd > 0)
+		continue;
+
+	    /* knn search */
+	    uid = clp[i].uid;
+	    kdfound = kdtree_knn(kdt, clp[i].c, ki, kd, minpnts, &uid);
+	    if (kdfound < minpnts)
+		G_fatal_error(_("Not enough points found"));
+
+	    clp[i].cd = kd[minpnts - 1];
+	    /* no reachability for the seed point !!! */
+	    clp[i].reach = clp[i].cd; /* ok ? */
+	    olist[nout++] = i;
+	    
+	    /* initialize heap */
+	    newrd = clp[i].cd;
 	    for (j = 0; j < kdfound; j++) {
-		if (idx[cid[kduid[j]]] > cat) {
-		    cat = idx[cid[kduid[j]]];
+		if (clp[clidx[ki[j]]].cd < 0) {
+		    /* deviation from OPTICS, 
+		     * creates nicer connectivity graph */
+		    newrd = kd[j];
+		    if (clp[clidx[ki[j]]].reach < 0 || clp[clidx[ki[j]]].reach > newrd) {
+			clp[clidx[ki[j]]].reach = newrd;
+			clp[clidx[ki[j]]].prevpnt = i;
+			add_pt(clidx[ki[j]]);
+		    }
 		}
 	    }
 
-	    if (cat == 0) {
-		/* start new cluster */
-		nclusters++;
-		cat = nclusters;
-		if (nclusters > nlines)
-		    G_fatal_error(_("nlines: %d, nclusters: %d"), nlines, nclusters);
-		idx[nclusters] = nclusters;
-		cid[uid] = nclusters;
+	    while (heapsize) {
+		k = drop_pt();
+		if (k < 0 || k >= kdpnts)
+		    G_fatal_error("Invalid index");
+		if (clp[k].cd > 0)
+		    continue;
+
+		/* knn search */
+		uid = clp[k].uid;
+		kdfound = kdtree_knn(kdt, clp[k].c, ki, kd, minpnts, &uid);
+		if (kdfound < minpnts)
+		    G_fatal_error(_("Not enough points found"));
+
+		clp[k].cd = kd[minpnts - 1];
+		olist[nout++] = k;
+
+		newrd = clp[k].cd;
+		for (j = 0; j < kdfound; j++) {
+		    if (heapsize >= npoints)
+			G_fatal_error("Heap is too large");
+		    if (clp[clidx[ki[j]]].cd < 0) {
+			/* deviation from OPTICS, 
+			 * creates nicer connectivity graph */
+			newrd = kd[j];
+			if (clp[clidx[ki[j]]].reach < 0 || clp[clidx[ki[j]]].reach > newrd) {
+			    clp[clidx[ki[j]]].reach = newrd;
+			    clp[clidx[ki[j]]].prevpnt = k;
+			    add_pt(clidx[ki[j]]);
+			}
+		    }
+		}
 	    }
+	}
+	G_percent(kdpnts, kdpnts, 4);
+	G_debug(0, "nout: %d", nout);
+	if (nout != kdpnts)
+	    G_fatal_error("nout != kdpnts");
 
-	    /* set or update cluster ids */
-	    if (cid[uid] != 0) {
-		/* relabel */
-		idx[cid[uid]] = cat;
-	    }
+	/* set cluster ids */
+	G_message(_("Set cluster ids ..."));
+	isout = 1;
+	nclusters = 0;
+	for (i = 0; i < kdpnts; i++) {
+	    G_percent(i, kdpnts, 4);
+
+	    if (eps > 0 && clp[olist[i]].reach > eps)
+		isout = 1;
 	    else {
-		cid[uid] = cat;
+		if (isout || clp[olist[i]].prevpnt == -1) {
+		    isout = 0;
+		    nclusters++;
+		}
+		cid[clp[olist[i]].uid] = nclusters;
 	    }
+	}
+	G_percent(kdpnts, kdpnts, 4);
 
+	/* write cluster ids */
+	G_message(_("Write out cluster ids ..."));
+	Vect_rewind(&In);
+	i = 0;
+	noutliers = 0;
+	while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+	    G_percent(i++, nlines, 4);
+	    if (type == GV_POINT) {
+		cat = cid[i];
+		if (!cat)
+		    noutliers++;
+		Vect_cat_set(Cats, clayer, cat);
+		Vect_write_line(&Out, GV_POINT, Points, Cats);
+	    }
+	}
+	G_percent(nlines, nlines, 4);
+    }
+    else if (clmethod == CL_OPTICS2) {
+	/* OPTICS modified, create separated reachability networks
+	 * for each point p
+	 *   get p's core distance
+	 *   get p's neighbors
+	 *   for each neighbor q
+	 *     new reachability of q: dist(p, q) 
+	 *     if q has been processed
+	 *       new reachability of q: max(coredist(q), dist(p, q))
+	 *     set or reduce q's reachability 
+	 *     connect q to p if q's reachability can be updated
+	 */
+
+	double *coredist;
+	double *reachability;
+	int *nextpnt;
+	double *kd;
+	int *ki;
+	double newrd;
+
+	coredist = G_malloc((nlines + 1) * sizeof(double));
+	reachability = G_malloc((nlines + 1) * sizeof(double));
+	nextpnt = G_malloc((nlines + 1) * sizeof(int));
+
+	kd = G_malloc(minpnts * sizeof(double));
+	ki = G_malloc(minpnts * sizeof(int));
+
+	for (i = 0; i <= nlines; i++) {
+	    coredist[i] = -1;
+	    reachability[i] = -1;
+	    nextpnt[i] = -1;
+	    cid[i] = 0;
+	    idx[i] = 0;
+	}
+
+	/* reachability network */
+	G_message(_("Reachability network ..."));
+	kdtree_init_trav(&trav, kdt);
+	c[2] = 0.0;
+	i = 0;
+	while (kdtree_traverse(&trav, c, &uid)) {
+	    G_percent(i++, npoints, 4);
+
+	    /* knn search */
+	    kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+	    if (kdfound < minpnts)
+		G_fatal_error(_("Not enough points found"));
+	    coredist[uid] = kd[minpnts - 1];
+
 	    for (j = 0; j < kdfound; j++) {
-		if (cid[kduid[j]] != 0) {
-		    /* relabel */
-		    idx[cid[kduid[j]]] = cat;
+		/* new reachability */
+		newrd = kd[j];
+		if (coredist[ki[j]] > kd[j]) {
+		    /* do not connect a point to its own cluster
+		     * because points in its own cluster
+		     * have already been connected to this point
+		     * or reconnected */
+		    newrd = coredist[ki[j]];
 		}
-		else {
-		   cid[kduid[j]] = cat;
+
+		if (reachability[ki[j]] == -1 || reachability[ki[j]] > newrd) {
+		    reachability[ki[j]] = newrd;
+		    nextpnt[ki[j]] = uid;
+
+		    /* no link - back link */
+		    if (nextpnt[uid] == ki[j]) {
+			if (coredist[ki[j]] == -1) {
+			    G_fatal_error(_("Neighbor point's core dist is -1"));
+			}
+			if (coredist[ki[j]] < coredist[uid]) {
+			    nextpnt[ki[j]] = -1;
+			    reachability[ki[j]] = -1;
+			    nextpnt[uid] = ki[j];
+			}
+			else {
+			    nextpnt[uid] = -1;
+			    reachability[uid] = -1;
+			    reachability[uid] = -1;
+			}
+		    }
 		}
 	    }
 	}
-	if (kdfound) {
-	    G_free(kddist);
-	    G_free(kduid);
+	G_percent(npoints, npoints, 4);
+
+	/* create clusters from reachability network */
+	G_message(_("Building clusters ..."));
+	G_percent(0, nlines, 4);
+	for (i = 1; i <= nlines; i++) {
+
+	    G_percent(i, nlines, 4);
+
+	    if (cid[i] > 0 || coredist[i] == -1 || nextpnt[i] == -1)
+		continue;
+
+	    if (cid[nextpnt[i]] > 0) {
+		cid[i] = idx[cid[nextpnt[i]]];
+	    }
+	    else {
+		/* start new cluster */
+		nclusters++;
+		cid[i] = nclusters;
+		idx[nclusters] = nclusters;
+		uid = nextpnt[i];
+
+		while (uid > 0) {
+		    if (cid[uid] == 0) {
+			cid[uid] = nclusters;
+
+			uid = nextpnt[uid];
+		    }
+		    else {
+			/* relabel */
+			OLD = cid[uid];
+			NEW = idx[OLD];
+			while (OLD != NEW) {
+			    OLD = NEW;
+			    NEW = idx[OLD];
+			}
+			idx[NEW] = nclusters;
+			uid = nextpnt[uid];
+			uid = -1;
+		    }
+		}
+	    }
 	}
-    }
-    G_percent(npoints, npoints, 4);
 
-    if (nclusters == 0) {
-	G_message(_("No clusters found, adjust option %s"), dist_opt->key);
-	Vect_close(&In);
-	Vect_close(&Out);
-	Vect_delete(output->answer);
-	exit(EXIT_SUCCESS);
-    }
+	/* generate a renumbering scheme */
+	G_message(_("Generating renumbering scheme..."));
+	G_debug(1, "%d initial clusters", nclusters);
+	/* allocate final clump ID */
+	renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
+	renumber[0] = 0;
+	cat = 1;
+	G_percent(0, nclusters, 1);
+	for (i = 1; i <= nclusters; i++) {
+	    G_percent(i, nclusters, 4);
+	    OLD = i;
+	    NEW = idx[i];
+	    if (OLD != NEW) {
+		renumber[i] = 0;
+		/* find valid clump ID */
+		while (OLD != NEW) {
+		    OLD = NEW;
+		    NEW = idx[OLD];
+		}
+		idx[i] = NEW;
+	    }
+	    else
+		/* set final clump id */
+		renumber[i] = cat++;
+	}
 
-    /* generate a renumbering scheme */
-    G_message(_("Generating renumbering scheme..."));
-    G_debug(1, "%d initial clusters", nclusters);
-    /* allocate final clump ID */
-    renumber = (int *) G_malloc((nclusters + 1) * sizeof(int));
-    renumber[0] = 0;
-    cat = 1;
-    G_percent(0, nclusters, 1);
-    for (i = 1; i <= nclusters; i++) {
-	G_percent(i, nclusters, 4);
-	OLD = i;
-	NEW = idx[i];
-	if (OLD != NEW) {
-	    renumber[i] = 0;
-	    /* find valid clump ID */
-	    while (OLD != NEW) {
-		OLD = NEW;
-		NEW = idx[OLD];
+	nclusters = cat - 1;
+
+	/* write cluster ids */
+	G_message(_("Write out cluster ids ..."));
+	Vect_rewind(&In);
+	i = 0;
+	noutliers = 0;
+	while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+	    G_percent(i++, nlines, 4);
+	    if (type == GV_POINT) {
+		cat = renumber[idx[cid[i]]];
+		if (!cat)
+		    noutliers++;
+		Vect_cat_set(Cats, clayer, cat);
+		Vect_write_line(&Out, GV_POINT, Points, Cats);
 	    }
-	    idx[i] = NEW;
 	}
-	else
-	    /* set final clump id */
-	    renumber[i] = cat++;
+	G_percent(nlines, nlines, 4);
     }
+    else if (clmethod == CL_DENSE) {
+	/* MATRUSKA 
+	 * clusters in clusters in clusters ...
+	 * calculate core density = distance to (minpnts - 1) for each point 
+	 * sort points ascending by core density
+	 * for each point in sorted list
+	 *   if point does not have a cluster id 
+	 *     start new cluster, cluster reachability is core density of this point
+	 *     connect all points within cluster reachability 
+	 *     add all connected points to list
+	 *     while list is not empty
+	 *       remove last point from list
+	 *       connect all points within cluster reachability 
+	 *       add all connected points to list
+	 *       */
+	int *clidx;
+	double *kd;
+	int *ki;
+	double cd;
+	int k, kdcount;
+	struct ilist *CList;
 
-    nclusters = cat - 1;
+	clp = G_malloc((nlines + 1) * sizeof(struct cl_pnt));
+	clidx = G_malloc((nlines + 1) * sizeof(int));
 
-    /* write cluster ids */
-    G_message(_("Write out cluster ids ..."));
-    Vect_rewind(&In);
-    i = 0;
-    while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
-	G_percent(i++, nlines, 4);
-	if (type == GV_POINT) {
-	    cat = renumber[idx[cid[i]]];
-	    Vect_cat_set(Cats, clayer, cat);
-	    Vect_write_line(&Out, GV_POINT, Points, Cats);
+	kd = G_malloc(minpnts * sizeof(double));
+	ki = G_malloc(minpnts * sizeof(int));
+
+	CList = G_new_ilist();
+
+	for (i = 0; i <= nlines; i++) {
+	    clp[i].c[0] = 0;
+	    clp[i].c[1] = 0;
+	    clp[i].c[2] = 0;
+	    clp[i].cd = -1;
+	    clp[i].uid = -1;
+	    clidx[i] = -1;
 	}
+
+	/* core density */
+	G_message(_("Core density ..."));
+	kdtree_init_trav(&trav, kdt);
+	c[2] = 0.0;
+	i = 0;
+	kdcount = 0;
+	uid = -1;
+	while (kdtree_traverse(&trav, c, &uid)) {
+	    G_percent(i++, npoints, 4);
+
+	    /* knn search */
+	    kdfound = kdtree_knn(kdt, c, ki, kd, minpnts, &uid);
+	    if (kdfound < minpnts)
+		G_fatal_error(_("Not enough points found"));
+
+	    cd = kd[minpnts - 1];
+
+	    /* list insert */
+	    for (j = kdcount; j > 0; j--) {
+		if (clp[j - 1].cd <= cd)
+		    break;
+		clp[j] = clp[j - 1];
+		clidx[clp[j].uid] = j;
+	    }
+	    clp[j].uid = uid;
+	    clp[j].c[0] = c[0];
+	    clp[j].c[1] = c[1];
+	    clp[j].c[2] = c[2];
+	    clp[j].cd = cd;
+	    clidx[clp[j].uid] = j;
+	    kdcount++;
+
+	}
+	G_percent(npoints, npoints, 4);
+
+	/* create clusters */
+	G_message(_("Building clusters ..."));
+	nclusters = 0;
+	for (i = 0; i < kdcount; i++) {
+	    G_percent(i, kdcount, 4);
+
+	    if (cid[clp[i].uid] > 0)
+		continue;
+
+	    /* knn search */
+	    kdfound = kdtree_knn(kdt, clp[i].c, ki, kd, minpnts, &clp[i].uid);
+	    if (kdfound < minpnts)
+		G_fatal_error(_("Not enough points found"));
+
+	    /* start a new cluster */
+	    uid = clp[i].uid;
+	    nclusters++;
+	    cat = nclusters;
+	    cid[uid] = cat;
+	    cd = clp[i].cd;
+	    CList->n_values = 0;
+	    for (j = 0; j < kdfound; j++) {
+		if (cid[ki[j]] == 0) {
+		    G_ilist_add(CList, clidx[ki[j]]);
+		    cid[ki[j]] = cat;
+		}
+	    }
+	    if (CList->n_values < minpnts) {
+		CList->n_values = 0;
+		nclusters--;
+		cid[uid] = 0;
+		for (j = 0; j < kdfound; j++) {
+		    if (cid[ki[j]] == cat) {
+			cid[ki[j]] = 0;
+		    }
+		}
+	    }
+
+	    while (CList->n_values) {
+		/* expand cluster */
+		CList->n_values--;
+		k = CList->value[CList->n_values];
+		if (k < 0)
+		    G_fatal_error("expand cluster: k < 0");
+		if (clp[k].uid < 1)
+		    G_fatal_error("expand cluster: clp[k].uid < 1");
+
+		kdfound = kdtree_knn(kdt, clp[k].c, ki, kd, minpnts, &clp[k].uid);
+		if (kdfound < minpnts)
+		    G_fatal_error(_("Not enough points found"));
+
+		for (j = 0; j < kdfound; j++) {
+		    if (kd[j] <= cd && cid[ki[j]] == 0) {
+			cid[ki[j]] = cat;
+			if (clidx[ki[j]] < 0)
+			    G_fatal_error("expand cluster ngbrs: clidx[ki[j]] < 0");
+			G_ilist_add(CList, clidx[ki[j]]);
+		    }
+		}
+	    }
+	}
+	G_percent(kdcount, kdcount, 4);
+
+	/* write cluster ids */
+	G_message(_("Write out cluster ids ..."));
+	Vect_rewind(&In);
+	i = 0;
+	noutliers = 0;
+	while ((type = Vect_read_next_line(&In, Points, Cats)) > 0) {
+	    G_percent(i++, nlines, 4);
+	    if (type == GV_POINT) {
+		cat = cid[i];
+		if (!cat)
+		    noutliers++;
+		Vect_cat_set(Cats, clayer, cat);
+		Vect_write_line(&Out, GV_POINT, Points, Cats);
+	    }
+	}
+	G_percent(nlines, nlines, 4);
     }
-    G_percent(nlines, nlines, 4);
 
     if (!flag_attr->answer)
 	Vect_copy_tables(&In, &Out, 0);
@@ -359,6 +1124,110 @@
     Vect_close(&Out);
 
     G_message(_("%d clusters found"), nclusters);
+    G_message(_("%d outliers found"), noutliers);
 
     exit(EXIT_SUCCESS);
 }
+
+/* min heap */
+
+/* compare heap points */
+int cmp_pnt(int a, int b)
+{
+    if (clp[a].reach < clp[b].reach)
+	return 1;
+    if (clp[a].reach > clp[b].reach)
+	return 0;
+    if (clp[a].uid < clp[b].uid)
+	return 1;
+    return 0;
+}
+
+/* standard sift-up routine for d-ary min heap */
+static int sift_up(int start)
+{
+    register int parent, child, child_added;
+
+    child = start;
+    child_added = heapidx[child];
+
+    while (child > 1) {
+	GET_PARENT(parent, child);
+
+	/* child smaller */
+	if (cmp_pnt(child_added, heapidx[parent])) {
+	    /* push parent point down */
+	    heapidx[child] = heapidx[parent];
+	    child = parent;
+	}
+	else
+	    /* no more sifting up, found new slot for child */
+	    break;
+    }
+
+    /* put point in new slot */
+    if (child < start) {
+	heapidx[child] = child_added;
+    }
+
+    return 0;
+}
+
+/* add point routine for min heap */
+int add_pt(int idx)
+{
+    /* add point to next free position */
+    heapsize++;
+
+    heapidx[heapsize] = idx;
+
+    /* sift up: move new point towards top of heap */
+    sift_up(heapsize);
+
+    return 0;
+}
+
+/* drop point routine for min heap */
+int drop_pt(void)
+{
+    register int child, childr, parent;
+    register int i, idx;
+
+    idx = heapidx[1];
+    if (heapsize == 1) {
+	heapidx[1] = -1;
+	heapsize = 0;
+	return idx;
+    }
+
+    /* start with root */
+    parent = 1;
+
+    /* sift down: move hole back towards bottom of heap */
+    while (GET_CHILD(child, parent) <= heapsize) {
+
+	i = child + 3;
+	for (childr = child + 1; childr <= heapsize && childr < i; childr++) {
+	    if (cmp_pnt(heapidx[childr], heapidx[child])) {
+		child = childr;
+	    }
+	}
+
+	/* move hole down */
+	heapidx[parent] = heapidx[child];
+	parent = child;
+    }
+
+    /* hole is in lowest layer, move to heap end */
+    if (parent < heapsize) {
+	heapidx[parent] = heapidx[heapsize];
+
+	/* sift up last swapped point, only necessary if hole moved to heap end */
+	sift_up(parent);
+    }
+
+    /* the actual drop */
+    heapsize--;
+
+    return idx;
+}

Modified: grass/trunk/vector/v.cluster/v.cluster.html
===================================================================
--- grass/trunk/vector/v.cluster/v.cluster.html	2015-01-08 22:05:12 UTC (rev 64016)
+++ grass/trunk/vector/v.cluster/v.cluster.html	2015-01-08 22:28:02 UTC (rev 64017)
@@ -1,30 +1,108 @@
 <h2>DESCRIPTION</h2>
 
 <em>v.cluster</em> partitions a point cloud into clusters or clumps. 
-A point can only be in a cluster if the maximum distance to its <i>min</i> 
-neighbors is smaller than distance. This algoritm is known as 
-<a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN</a>.
 
 <p>
-If the minimum number of points is not given with the <i>min</i> option, 
-the minimum number of points to consitute a cluster is <i>number of dimensions + 1</i>, 
-i.e. 3 for 2D points and 4 for 3d points.
+If the minimum number of points is not specified with the <i>min</i> 
+option, the minimum number of points to constitute a cluster is 
+<i>number of dimensions + 1</i>, i.e. 3 for 2D points and 4 for 3D 
+points.
 
+<p>
+If the maximum distance is not specified with the <i>distance</i> 
+option, the maximum distance is estimated from the observed distances 
+to the neighbors using the upper 99% confidence interval.
 
+<p>
+<em>v.cluster</em> supports different methods for clustering. The 
+recommended methods are <i>method=dbscan</i> if all clusters should 
+have a density (maximum distance between points) not larger than 
+<i>distance</i> or <i>method=density</i> if clusters should be created 
+separately for each observed density (distance to the farthest neighbor).
+
+<h4>dbscan</h4>
+The <a href="http://en.wikipedia.org/wiki/DBSCAN">Density-Based Spatial 
+Clustering of Applications with Noise</a> is a commonly used clustering 
+algorithm. A new cluster is started for a point with at least 
+<i>min</i> - 1 neighbors within the maximum distance. These neighbors 
+are added to the cluster. The cluster is then expanded as long as at 
+least <i>min</i> - 1 neighbors are within the maximum distance for each 
+point already in the cluster.
+
+<h4>dbscan2</h4>
+Similar to <i>dbscan</i>, but here it is sufficient if the resultant 
+cluster consists of at least <i>min</i> points, even if no point in the 
+cluster has at least <i>min</i> -1 neighbors within <i>distance</i>.
+
+<h4>density</h4>
+This method creates clusters according to their point density. The 
+maximum distance is not used. Instead, the points are sorted ascending 
+by the distance to their farthest neighbor (core distance), inspecting 
+<i>min</i> - 1 neighbors. The densest cluster is created first, using 
+as threshold the core distance of the seed point. The cluster is 
+expanded as for DBSCAN, with the difference that each cluster has its 
+own maximum distance. This method can identify clusters with different 
+densities and can create nested clusters.
+
+<h4>optics</h4>
+This method is <a 
+href="http://en.wikipedia.org/wiki/OPTICS_algorithm">Ordering Points to 
+Identify the Clustering Structure</a>. It is controlled by the number 
+of neighbor points (option <i>min</i> - 1). The core distance of a 
+point is the distance to the farthest neighbor. The reachability of a 
+point <i>q</i> is its distance from a point <i>p</i> (original optics: 
+max(core-distance(p), distance(p, q))). The aim of the <i>optics</i> 
+method is to reduce the reachability of each point. Each unprocessed 
+point is the seed for a new cluster. Its neighbors are added to a queue 
+sorted by smallest reachability if their reachability can be reduced. 
+The points in the queue are processed and their unprocessed neighbors 
+are added to a queue sorted by smallest reachability if their 
+reachability can be reduced.
+
+<p>
+The <i>optics</i> method does not create clusters itself, but produces 
+an ordered list of the points together with their reachability. The 
+output list is ordered according to the order of processing: the first 
+point processed is the first in the list, the last point processed is 
+the last in the list. Clusters can be extracted from this list by 
+identifying valleys in the points' reachability, e.g. by using a 
+threshold value. If a maximum distance is specified, this is used to 
+identify clusters, otherwise each separated network will constitute a 
+cluster.
+
+<p>
+The OPTICS algorithm uses each yet unprocessed point to start a new 
+cluster. The order of the input points is arbitrary and can thus 
+influence the resultant clusters.
+
+<h4>optics2</h4>
+<b>EXPERIMENTAL</b> This method is similar to OPTICS, minimizing the 
+reachability of each point. Points are reconnected if their 
+reachability can be reduced. Contrary to OPTICS, a cluster's seed is 
+not fixed but changed if possible. Each point is connected to another 
+point until the core of the cluster (seed point) is reached. 
+Effectively, the initial seed is updated in the process. Thus separated 
+networks of points are created, with each network representing a 
+cluster. The maximum distance is not used.
+
 <h2>EXAMPLE</h2>
 
-Analysis of random points for areas in the vector <i>urbanarea</i> in the 
-North Carolina sample dataset:
+Analysis of random points for areas in areas of the vector 
+<i>urbanarea</i> (North Carolina sample dataset).
 
+<p>
+10000 random points within the areas the vector urbanarea and within the 
+subregion:
+
 <div class="code"><pre>
-# pick a subregion of he vector urbanarea
+# pick a subregion of the vector urbanarea
 g.region -p n=272950 s=188330 w=574720 e=703090 res=10
 
 # create clustered points
-v.random output=rand_clust npoints=1000000 restrict=urbanarea at PERMANENT
+v.random output=rand_clust npoints=10000 restrict=urbanarea at PERMANENT
 
 # identify clusters
-v.cluster in=rand_clust out=rand_clusters
+v.cluster in=rand_clust out=rand_clusters method=dbscan
 
 # create colors for clusters
 v.db.addtable map=rand_clusters layer=2 columns="cat integer,grassrgb varchar(11)"
@@ -33,10 +111,28 @@
 # display with your preferred method
 </pre></div>
 
-<h2>TODO</h2>
+<p>
+100 random points for each area in the vector urbanarea and within the 
+subregion:
 
-Implement <a href="http://en.wikipedia.org/wiki/OPTICS_algorithm">OPTICS</a>
+<div class="code"><pre>
+# pick a subregion of the vector urbanarea
+g.region -p n=272950 s=188330 w=574720 e=703090 res=10
 
+# create clustered points
+v.random output=rand_clust npoints=100 restrict=urbanarea at PERMANENT -a
+
+# identify clusters
+v.cluster in=rand_clust out=rand_clusters method=density
+
+# create colors for clusters
+v.db.addtable map=rand_clusters layer=2 columns="cat integer,grassrgb varchar(11)"
+v.colors map=rand_clusters layer=2 use=cat color=random rgb_column=grassrgb
+
+# display with your preferred method
+</pre></div>
+
+
 <h2>SEE ALSO</h2>
 
 <em>



More information about the grass-commit mailing list