[GRASS-SVN] r59417 - grass/branches/develbranch_6/vector/v.kcv

svn_grass at osgeo.org svn_grass at osgeo.org
Thu Mar 27 07:08:45 PDT 2014


Author: neteler
Date: 2014-03-27 07:08:45 -0700 (Thu, 27 Mar 2014)
New Revision: 59417

Modified:
   grass/branches/develbranch_6/vector/v.kcv/main.c
Log:
v.kcv: backport of r57231, using patch in trac #2035

Modified: grass/branches/develbranch_6/vector/v.kcv/main.c
===================================================================
--- grass/branches/develbranch_6/vector/v.kcv/main.c	2014-03-27 14:08:26 UTC (rev 59416)
+++ grass/branches/develbranch_6/vector/v.kcv/main.c	2014-03-27 14:08:45 UTC (rev 59417)
@@ -1,29 +1,23 @@
-/*
- * from s.kcv
- * Copyright (C) 1993-1994. James Darrell McCauley.
+
+/****************************************************************
  *
- * Author: James Darrell McCauley darrell at mccauley-usa.com
- *                                http://mccauley-usa.com/
+ * MODULE:     v.kcv
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
+ * AUTHOR(S):  James Darrell McCauley darrell at mccauley-usa.com
+ *             OGR support by Martin Landa <landa.martin gmail.com> (2009)
+ *             Spped-up by Jan Vandrol and Jan Ruzicka (2013)
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * PURPOSE:    Randomly partition points into test/train sets.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * COPYRIGHT:  (C) 2001-2009 by James Darrell McCauley, and the GRASS Development Team
  *
- * Modification History:
- * 4.2B <27 Jan 1994>  fixed RAND_MAX for Solaris 2.3
- * <13 Sep 2000> released under GPL
- * 10/2004 Upgrade to 5.7 Radim Blazek
- */
+ *             This program is free software under the GNU General
+ *             Public License (>=v2).  Read the file COPYING that
+ *             comes with GRASS for details.
+ *
+ ****************************************************************/
+
+
 #include <stdlib.h>
 #include <unistd.h>
 #include <math.h>
@@ -31,96 +25,87 @@
 #include <grass/dbmi.h>
 #include <grass/Vect.h>
 #include <grass/glocale.h>
-#include "kcv.h"
 
-#ifndef RAND_MAX
-#define RAND_MAX (pow(2.0,31.0)-1)
-#endif
-#if defined(__CYGWIN__) || defined(__APPLE__) || defined(__MINGW32__)
-double drand48()
+static double myrand(void)
 {
-    return (rand() / 32767.0);
+    return rand() / (1.0 + RAND_MAX);
 }
 
-#define srand48(sv) (srand((unsigned)(sv)))
-#else
-double drand48();
-void srand48();
-#endif
-
 struct Cell_head window;
 
-int main(int argc, char *argv[])
+int main(int argc, char *argv[]) 
 {
-    int line, nlines, nlinks;
-    double east, north, (*rng) (), max, myrand();
-    int i, j, n, nsites, verbose, np, *p, dcmp();
-    int *pnt_part;
-    struct Map_info In, Out;
-    static struct line_pnts *Points;
+    int i, line, nlines, nlinks;
+    int idx, *line_idx, lines_left;
+    double (*rng) ();
+    int nsites, p, np, min_count, spill;
+    struct partition {
+	int id, count, max;
+    } *part;
+    int last_pidx;
+    struct Map_info Map;
+    struct line_pnts *Points;
     struct line_cats *Cats;
-    char *mapset;
     struct GModule *module;
-    struct Option *in_opt, *out_opt, *col_opt, *npart_opt;
-    struct Flag *drand48_flag, *q_flag;
-    BOUND_BOX box;
-    double maxdist;
-
+    struct Option *map_opt, *col_opt, *npart_opt, *field_opt;
+    struct Flag *drand48_flag;
+ 
     /* Attributes */
     struct field_info *Fi;
-    dbDriver *Driver;
+    int layer;
+    dbDriver * Driver;
     char buf[2000];
     dbString sql;
 
+
     module = G_define_module();
-    module->keywords = _("vector, statistics");
+    module->keywords = _("vector, statistics, points");
     module->description =
 	_("Randomly partition points into test/train sets.");
 
-    in_opt = G_define_standard_option(G_OPT_V_INPUT);
-    out_opt = G_define_standard_option(G_OPT_V_OUTPUT);
+    map_opt = G_define_standard_option(G_OPT_V_MAP);
 
+    field_opt = G_define_standard_option(G_OPT_V_FIELD);
+
     npart_opt = G_define_option();
     npart_opt->key = "k";
     npart_opt->type = TYPE_INTEGER;
     npart_opt->required = YES;
-    npart_opt->description = _("Number of partitions");
-    npart_opt->options = "1-32767";
+    npart_opt->label = _("Number of partitions");
+    npart_opt->description = _("Must be > 1");
 
-    col_opt = G_define_option();
-    col_opt->key = "column";
-    col_opt->type = TYPE_STRING;
-    col_opt->required = YES;
-    col_opt->multiple = NO;
+    col_opt = G_define_standard_option(G_OPT_COLUMN);
     col_opt->answer = "part";
     col_opt->description =
 	_("Name for new column to which partition number is written");
 
     drand48_flag = G_define_flag();
     drand48_flag->key = 'd';
+#ifdef HAVE_DRAND48
     drand48_flag->description = _("Use drand48()");
+#else
+    drand48_flag->description = _("Use drand48() (ignored)");
+#endif
 
-    /* please remove in GRASS 7 */
-    q_flag = G_define_flag();
-    q_flag->key = 'q';
-    q_flag->description = _("Quiet");
 
     G_gisinit(argv[0]);
 
     if (G_parser(argc, argv))
 	exit(EXIT_FAILURE);
 
-    verbose = (!q_flag->answer);
     np = atoi(npart_opt->answer);
+    if (np < 2)
+	G_fatal_error(_("'%s' must be > 1"), npart_opt->key);
 
+#ifdef HAVE_DRAND48
     if (drand48_flag->answer) {
 	rng = drand48;
-	max = 1.0;
 	srand48((long)getpid());
     }
-    else {
+    else
+#endif
+    {
 	rng = myrand;
-	max = RAND_MAX;
 	srand(getpid());
     }
 
@@ -128,160 +113,224 @@
     Cats = Vect_new_cats_struct();
 
     /* open input vector */
-    if ((mapset = G_find_vector2(in_opt->answer, "")) == NULL) {
-	G_fatal_error(_("Vector map <%s> not found"), in_opt->answer);
-    }
-
     Vect_set_open_level(2);
-    if (Vect_open_old(&In, in_opt->answer, mapset) < 2) {
+    if (Vect_open_old(&Map, map_opt->answer, G_mapset()) < 2) {
 	G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
-		      in_opt->answer, 2);
+		      map_opt->answer, 2);
     }
 
-    Vect_get_map_box(&In, &box);
+    layer = atoi(field_opt->answer);
+    if (layer <= 0)
+	G_fatal_error(_("Layer number must be positive"));
 
-    nsites = Vect_get_num_primitives(&In, GV_POINT);
+    nsites = Vect_get_num_primitives(&Map, GV_POINT);
 
     if (nsites < np) {
-	G_fatal_error("Sites found: %i\nMore partitions than sites.", nsites);
+	G_fatal_error(_("More partitions than points (%d)"), nsites);
     }
 
-    Vect_set_fatal_error(GV_FATAL_PRINT);
-    Vect_open_new(&Out, out_opt->answer, Vect_is_3d(&In));
-
-    Vect_copy_head_data(&In, &Out);
-    Vect_hist_copy(&In, &Out);
-    Vect_hist_command(&Out);
-
-    /* Copy vector lines */
-    Vect_copy_map_lines(&In, &Out);
-
-    /* Copy tables */
-    if (Vect_copy_tables(&In, &Out, 0))
-	G_warning(_("Failed to copy attribute table to output map"));
-
     /* Add column */
     db_init_string(&sql);
 
     /* Check if there is a database for output */
-    nlinks = Vect_get_num_dblinks(&Out);
+    nlinks = Vect_get_num_dblinks(&Map);
     if (nlinks < 1)
-	Fi = Vect_default_field_info(&Out, 1, NULL, GV_1TABLE);
+	Fi = Vect_default_field_info(&Map, layer, NULL, GV_1TABLE);
     else
-	Fi = Vect_get_field(&Out, 1);
+	Fi = Vect_get_field(&Map, layer);
     if (Fi == NULL) {
-	Vect_delete(Out.name);
 	G_fatal_error(_("Unable to get layer info for vector map <%s>"),
-		      out_opt->answer);
+		      map_opt->answer);
     }
 
     Driver = db_start_driver_open_database(Fi->driver, Fi->database);
     if (Driver == NULL) {
-	Vect_delete(Out.name);
 	G_fatal_error(_("Unable to open database <%s> by driver <%s>"),
 		      Fi->database, Fi->driver);
     }
 
+    buf[0] = '\0';
     if (nlinks < 1)
 	sprintf(buf, "create table %s (%s integer, %s integer)", Fi->table,
 		Fi->key, col_opt->answer);
-    else
-	sprintf(buf, "alter table %s add column %s integer", Fi->table,
-		col_opt->answer);
+    else {
+	dbColumn *column = NULL;
 
-    db_set_string(&sql, buf);
+	/* check if column exists */
+	db_get_column(Driver, Fi->table, col_opt->answer, &column);
+	if (column) {
+            int sqltype = db_get_column_sqltype(column);
+	    int ctype = db_sqltype_to_Ctype(sqltype);
 
-    G_debug(3, "SQL: %s", db_get_string(&sql));
+	    if (ctype != DB_C_TYPE_INT && ctype != DB_C_TYPE_DOUBLE) {
+		G_fatal_error(_("Column <%s> already exists but is not numeric"),
+		              col_opt->answer);
+	    }
+	}
+	else
+	    sprintf(buf, "alter table %s add column %s integer", Fi->table,
+		    col_opt->answer);
+    }
 
-    if (db_execute_immediate(Driver, &sql) != DB_OK) {
-	Vect_delete(Out.name);
-	G_fatal_error(_("Cannot alter table: %s"), db_get_string(&sql));
+    if (*buf) {
+	db_set_string(&sql, buf);
+
+	G_debug(3, "SQL: %s", db_get_string(&sql));
+
+	if (db_execute_immediate(Driver, &sql) != DB_OK) {
+	    G_fatal_error(_("Unable to alter table: %s"), db_get_string(&sql));
+	}
     }
-    if (nlinks < 1)
-	Vect_map_add_dblink(&Out, Fi->number, Fi->name, Fi->table, Fi->key,
+
+    if (nlinks < 1) {
+	Vect_set_open_level(1);
+	Vect_close(&Map);
+	if (Vect_open_update_head(&Map, map_opt->answer, G_mapset()) < 1)
+	    G_fatal_error(_("Unable to modify vector map stored in other mapset"));
+	Vect_map_add_dblink(&Map, layer, Fi->name, Fi->table, Fi->key,
 			    Fi->database, Fi->driver);
+	Vect_close(&Map);
 
-    /*
-     * make histogram of number sites in each test partition since the
-     * number of sites will not always be a multiple of the number of
-     * partitions. make_histo() returns the maximum bin size.
-     */
-    n = make_histo(&p, np, nsites);
+	if (db_create_index2(Driver, Fi->table, Fi->key) !=
+	    DB_OK)
+	    G_warning(_("Cannot create index"));
 
-    nlines = Vect_get_num_lines(&In);
-    pnt_part = (int *)G_calloc(nlines + 1, sizeof(int));
+	if (db_grant_on_table(Driver, Fi->table, DB_PRIV_SELECT,
+	                      DB_GROUP | DB_PUBLIC) != DB_OK)
+	    G_warning(_("Cannot grant privileges on table %s"),
+		      Fi->table);
 
-    maxdist = 1.1 * sqrt(pow(box.E - box.W, 2.0) + pow(box.N - box.S, 2.0));
+	G_important_message(_("Select privileges were granted on the table"));
 
-    /* Assign fold to each point */
-    for (i = 0; i < np; ++i) {	/* for each partition */
-	for (j = 0; j < p[i]; ++j) {	/* create p[i] random points */
-	    int nearest = 0;
-	    double dist;
+	Vect_set_open_level(2);
+	if (Vect_open_old(&Map, map_opt->answer, G_mapset()) < 2) {
+	    G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
+			  map_opt->answer, 2);
+	}
+    }
 
-	    east = rng() / max * (box.E - box.W) + box.W;
-	    north = rng() / max * (box.N - box.S) + box.S;
+    /* nlines - count of records */ 
+    nlines = Vect_get_num_lines(&Map);
 
-	    G_debug(3, "east = %f north = %f", east, north);
+    /* last_pidx - current maximal index of partition array */ 
+    last_pidx = np - 1;
 
-	    /* find nearest */
-	    for (line = 1; line <= nlines; line++) {
-		int type;
-		double cur_dist;
+    /* minimum number of features in one partition */
+    min_count = nsites / np;
+    G_debug(1, "min count: %d", min_count);
 
-		if (pnt_part[line] > 0)
-		    continue;
+    /* number of partitions that need min_count + 1 features */
+    spill = nsites - np * min_count;  /* nsites % np, but modulus is evil */
+    G_debug(1, "spill: %d", spill);
 
-		type = Vect_read_line(&In, Points, Cats, line);
+    /* array of available partitions */ 
+    part = (struct partition *)G_calloc(np, sizeof(struct partition));
+    if (!part)
+	G_fatal_error(_("Out of memory"));
 
-		if (!(type & GV_POINT))
-		    continue;
+    /* line index */ 
+    line_idx = (int *)G_calloc(nlines, sizeof(int));
 
-		cur_dist = hypot(Points->x[0] - east, Points->y[0] - north);
+    /* initialization of arrays */ 
+    for (p = 0; p < np; p++) {
+	part[p].id = p + 1;	/* partition ids */
+	part[p].count = 0;
+	part[p].max = min_count;
+    }
+    for (p = 0; p < spill; p++) {
+	part[p].max++;
+    }
 
-		if (nearest < 1 || cur_dist < dist) {
-		    nearest = line;
-		    dist = cur_dist;
-		}
-	    }
+    for (i = 0; i < nlines; i++)
+	line_idx[i] = i + 1;
 
-	    G_debug(3, "nearest = %d", nearest);
+    /* proper randomization requires a 2-step randomization
+     * randomize in space
+     * randomize partition assignment
+     * looping sequentially through all points creates a bias */
 
-	    /* Update */
-	    if (nearest > 0) {	/* shopuld be always */
-		int cat;
+    /* process all points */
+    db_begin_transaction(Driver);
+    lines_left = nlines;
+    while (lines_left) {
+	int type, cat;
 
-		Vect_read_line(&In, Points, Cats, nearest);
-		Vect_cat_get(Cats, 1, &cat);
+	G_percent(nlines - lines_left, nlines, 4);
 
-		if (nlinks < 1)
-		    sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
-			    Fi->table, Fi->key, col_opt->answer, cat, i + 1);
-		else
-		    sprintf(buf, "update %s set %s = %d where %s = %d",
-			    Fi->table, col_opt->answer, i + 1, Fi->key, cat);
+	/* random point */
+	do {
+	    idx = rng() * lines_left;
+	    /* in case rng() returns 1.0: */
+	} while (idx >= lines_left);
 
-		db_set_string(&sql, buf);
+	line = line_idx[idx];
+	lines_left--;
+	line_idx[idx] = line_idx[lines_left];
 
-		G_debug(3, "SQL: %s", db_get_string(&sql));
+	if (!Vect_line_alive(&Map, line))
+	    continue;
 
-		if (db_execute_immediate(Driver, &sql) != DB_OK) {
-		    Vect_delete(Out.name);
-		    G_fatal_error(_("Unable to insert row: %s"),
-				  db_get_string(&sql));
-		}
-		pnt_part[nearest] = i + 1;
+	type = Vect_read_line(&Map, Points, Cats, line);
 
+	if (!(type & GV_POINT))
+	    continue;
+
+	/* random partition for current point */
+	do {
+	    p = rng() * (last_pidx + 1);
+	    /* in case rng() returns 1.0: */
+	} while (p > last_pidx);
+
+	G_debug(3, "partition id = %d", part[p].id);
+
+	Vect_cat_get(Cats, 1, &cat);
+	if (cat < 0) {
+	    G_warning(_("No category for line %d in layer %d"), line, layer);
+	    continue;
+	}
+	if (nlinks < 1)
+	    sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
+			 Fi->table, Fi->key, col_opt->answer, cat,
+			 part[p].id);
+	else
+	    sprintf(buf, "update %s set %s = %d where %s = %d", 
+	                 Fi->table, col_opt->answer, part[p].id,
+			 Fi->key, cat);
+
+	db_set_string(&sql, buf);
+
+	G_debug(3, "SQL: %s", db_get_string(&sql));
+
+	if (db_execute_immediate(Driver, &sql) != DB_OK) {
+	    G_fatal_error(_("Unable to insert row: %s"),
+	    db_get_string(&sql));
+	}
+
+	/* increase count of features in partition */
+	part[p].count++;
+	/* if the partition is full */
+	if (part[p].count >= part[p].max) {
+	    /* move last partition to current position */
+	    if (p != last_pidx)
+		part[p] = part[last_pidx];
+
+	    /* disable last partition */
+	    last_pidx--;
+
+	    if (last_pidx < 0 && lines_left) {
+		G_fatal_error(_("internal error"));
 	    }
 	}
     }
+    G_percent(1, 1, 1);
 
-    Vect_close(&In);
-
+    db_commit_transaction(Driver);
     db_close_database_shutdown_driver(Driver);
 
-    Vect_build(&Out);
-    Vect_close(&Out);
+    Vect_set_db_updated(&Map);
+    Vect_close(&Map);
 
     exit(EXIT_SUCCESS);
 }
+
+



More information about the grass-commit mailing list