[GRASS-SVN] r59417 - grass/branches/develbranch_6/vector/v.kcv
svn_grass at osgeo.org
svn_grass at osgeo.org
Thu Mar 27 07:08:45 PDT 2014
Author: neteler
Date: 2014-03-27 07:08:45 -0700 (Thu, 27 Mar 2014)
New Revision: 59417
Modified:
grass/branches/develbranch_6/vector/v.kcv/main.c
Log:
v.kcv: backport of r57231, using patch in trac #2035
Modified: grass/branches/develbranch_6/vector/v.kcv/main.c
===================================================================
--- grass/branches/develbranch_6/vector/v.kcv/main.c 2014-03-27 14:08:26 UTC (rev 59416)
+++ grass/branches/develbranch_6/vector/v.kcv/main.c 2014-03-27 14:08:45 UTC (rev 59417)
@@ -1,29 +1,23 @@
-/*
- * from s.kcv
- * Copyright (C) 1993-1994. James Darrell McCauley.
+
+/****************************************************************
*
- * Author: James Darrell McCauley darrell at mccauley-usa.com
- * http://mccauley-usa.com/
+ * MODULE: v.kcv
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
+ * AUTHOR(S): James Darrell McCauley darrell at mccauley-usa.com
+ * OGR support by Martin Landa <landa.martin gmail.com> (2009)
+ * Spped-up by Jan Vandrol and Jan Ruzicka (2013)
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * PURPOSE: Randomly partition points into test/train sets.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * COPYRIGHT: (C) 2001-2009 by James Darrell McCauley, and the GRASS Development Team
*
- * Modification History:
- * 4.2B <27 Jan 1994> fixed RAND_MAX for Solaris 2.3
- * <13 Sep 2000> released under GPL
- * 10/2004 Upgrade to 5.7 Radim Blazek
- */
+ * This program is free software under the GNU General
+ * Public License (>=v2). Read the file COPYING that
+ * comes with GRASS for details.
+ *
+ ****************************************************************/
+
+
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
@@ -31,96 +25,87 @@
#include <grass/dbmi.h>
#include <grass/Vect.h>
#include <grass/glocale.h>
-#include "kcv.h"
-#ifndef RAND_MAX
-#define RAND_MAX (pow(2.0,31.0)-1)
-#endif
-#if defined(__CYGWIN__) || defined(__APPLE__) || defined(__MINGW32__)
-double drand48()
+static double myrand(void)
{
- return (rand() / 32767.0);
+ return rand() / (1.0 + RAND_MAX);
}
-#define srand48(sv) (srand((unsigned)(sv)))
-#else
-double drand48();
-void srand48();
-#endif
-
struct Cell_head window;
-int main(int argc, char *argv[])
+int main(int argc, char *argv[])
{
- int line, nlines, nlinks;
- double east, north, (*rng) (), max, myrand();
- int i, j, n, nsites, verbose, np, *p, dcmp();
- int *pnt_part;
- struct Map_info In, Out;
- static struct line_pnts *Points;
+ int i, line, nlines, nlinks;
+ int idx, *line_idx, lines_left;
+ double (*rng) ();
+ int nsites, p, np, min_count, spill;
+ struct partition {
+ int id, count, max;
+ } *part;
+ int last_pidx;
+ struct Map_info Map;
+ struct line_pnts *Points;
struct line_cats *Cats;
- char *mapset;
struct GModule *module;
- struct Option *in_opt, *out_opt, *col_opt, *npart_opt;
- struct Flag *drand48_flag, *q_flag;
- BOUND_BOX box;
- double maxdist;
-
+ struct Option *map_opt, *col_opt, *npart_opt, *field_opt;
+ struct Flag *drand48_flag;
+
/* Attributes */
struct field_info *Fi;
- dbDriver *Driver;
+ int layer;
+ dbDriver * Driver;
char buf[2000];
dbString sql;
+
module = G_define_module();
- module->keywords = _("vector, statistics");
+ module->keywords = _("vector, statistics, points");
module->description =
_("Randomly partition points into test/train sets.");
- in_opt = G_define_standard_option(G_OPT_V_INPUT);
- out_opt = G_define_standard_option(G_OPT_V_OUTPUT);
+ map_opt = G_define_standard_option(G_OPT_V_MAP);
+ field_opt = G_define_standard_option(G_OPT_V_FIELD);
+
npart_opt = G_define_option();
npart_opt->key = "k";
npart_opt->type = TYPE_INTEGER;
npart_opt->required = YES;
- npart_opt->description = _("Number of partitions");
- npart_opt->options = "1-32767";
+ npart_opt->label = _("Number of partitions");
+ npart_opt->description = _("Must be > 1");
- col_opt = G_define_option();
- col_opt->key = "column";
- col_opt->type = TYPE_STRING;
- col_opt->required = YES;
- col_opt->multiple = NO;
+ col_opt = G_define_standard_option(G_OPT_COLUMN);
col_opt->answer = "part";
col_opt->description =
_("Name for new column to which partition number is written");
drand48_flag = G_define_flag();
drand48_flag->key = 'd';
+#ifdef HAVE_DRAND48
drand48_flag->description = _("Use drand48()");
+#else
+ drand48_flag->description = _("Use drand48() (ignored)");
+#endif
- /* please remove in GRASS 7 */
- q_flag = G_define_flag();
- q_flag->key = 'q';
- q_flag->description = _("Quiet");
G_gisinit(argv[0]);
if (G_parser(argc, argv))
exit(EXIT_FAILURE);
- verbose = (!q_flag->answer);
np = atoi(npart_opt->answer);
+ if (np < 2)
+ G_fatal_error(_("'%s' must be > 1"), npart_opt->key);
+#ifdef HAVE_DRAND48
if (drand48_flag->answer) {
rng = drand48;
- max = 1.0;
srand48((long)getpid());
}
- else {
+ else
+#endif
+ {
rng = myrand;
- max = RAND_MAX;
srand(getpid());
}
@@ -128,160 +113,224 @@
Cats = Vect_new_cats_struct();
/* open input vector */
- if ((mapset = G_find_vector2(in_opt->answer, "")) == NULL) {
- G_fatal_error(_("Vector map <%s> not found"), in_opt->answer);
- }
-
Vect_set_open_level(2);
- if (Vect_open_old(&In, in_opt->answer, mapset) < 2) {
+ if (Vect_open_old(&Map, map_opt->answer, G_mapset()) < 2) {
G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
- in_opt->answer, 2);
+ map_opt->answer, 2);
}
- Vect_get_map_box(&In, &box);
+ layer = atoi(field_opt->answer);
+ if (layer <= 0)
+ G_fatal_error(_("Layer number must be positive"));
- nsites = Vect_get_num_primitives(&In, GV_POINT);
+ nsites = Vect_get_num_primitives(&Map, GV_POINT);
if (nsites < np) {
- G_fatal_error("Sites found: %i\nMore partitions than sites.", nsites);
+ G_fatal_error(_("More partitions than points (%d)"), nsites);
}
- Vect_set_fatal_error(GV_FATAL_PRINT);
- Vect_open_new(&Out, out_opt->answer, Vect_is_3d(&In));
-
- Vect_copy_head_data(&In, &Out);
- Vect_hist_copy(&In, &Out);
- Vect_hist_command(&Out);
-
- /* Copy vector lines */
- Vect_copy_map_lines(&In, &Out);
-
- /* Copy tables */
- if (Vect_copy_tables(&In, &Out, 0))
- G_warning(_("Failed to copy attribute table to output map"));
-
/* Add column */
db_init_string(&sql);
/* Check if there is a database for output */
- nlinks = Vect_get_num_dblinks(&Out);
+ nlinks = Vect_get_num_dblinks(&Map);
if (nlinks < 1)
- Fi = Vect_default_field_info(&Out, 1, NULL, GV_1TABLE);
+ Fi = Vect_default_field_info(&Map, layer, NULL, GV_1TABLE);
else
- Fi = Vect_get_field(&Out, 1);
+ Fi = Vect_get_field(&Map, layer);
if (Fi == NULL) {
- Vect_delete(Out.name);
G_fatal_error(_("Unable to get layer info for vector map <%s>"),
- out_opt->answer);
+ map_opt->answer);
}
Driver = db_start_driver_open_database(Fi->driver, Fi->database);
if (Driver == NULL) {
- Vect_delete(Out.name);
G_fatal_error(_("Unable to open database <%s> by driver <%s>"),
Fi->database, Fi->driver);
}
+ buf[0] = '\0';
if (nlinks < 1)
sprintf(buf, "create table %s (%s integer, %s integer)", Fi->table,
Fi->key, col_opt->answer);
- else
- sprintf(buf, "alter table %s add column %s integer", Fi->table,
- col_opt->answer);
+ else {
+ dbColumn *column = NULL;
- db_set_string(&sql, buf);
+ /* check if column exists */
+ db_get_column(Driver, Fi->table, col_opt->answer, &column);
+ if (column) {
+ int sqltype = db_get_column_sqltype(column);
+ int ctype = db_sqltype_to_Ctype(sqltype);
- G_debug(3, "SQL: %s", db_get_string(&sql));
+ if (ctype != DB_C_TYPE_INT && ctype != DB_C_TYPE_DOUBLE) {
+ G_fatal_error(_("Column <%s> already exists but is not numeric"),
+ col_opt->answer);
+ }
+ }
+ else
+ sprintf(buf, "alter table %s add column %s integer", Fi->table,
+ col_opt->answer);
+ }
- if (db_execute_immediate(Driver, &sql) != DB_OK) {
- Vect_delete(Out.name);
- G_fatal_error(_("Cannot alter table: %s"), db_get_string(&sql));
+ if (*buf) {
+ db_set_string(&sql, buf);
+
+ G_debug(3, "SQL: %s", db_get_string(&sql));
+
+ if (db_execute_immediate(Driver, &sql) != DB_OK) {
+ G_fatal_error(_("Unable to alter table: %s"), db_get_string(&sql));
+ }
}
- if (nlinks < 1)
- Vect_map_add_dblink(&Out, Fi->number, Fi->name, Fi->table, Fi->key,
+
+ if (nlinks < 1) {
+ Vect_set_open_level(1);
+ Vect_close(&Map);
+ if (Vect_open_update_head(&Map, map_opt->answer, G_mapset()) < 1)
+ G_fatal_error(_("Unable to modify vector map stored in other mapset"));
+ Vect_map_add_dblink(&Map, layer, Fi->name, Fi->table, Fi->key,
Fi->database, Fi->driver);
+ Vect_close(&Map);
- /*
- * make histogram of number sites in each test partition since the
- * number of sites will not always be a multiple of the number of
- * partitions. make_histo() returns the maximum bin size.
- */
- n = make_histo(&p, np, nsites);
+ if (db_create_index2(Driver, Fi->table, Fi->key) !=
+ DB_OK)
+ G_warning(_("Cannot create index"));
- nlines = Vect_get_num_lines(&In);
- pnt_part = (int *)G_calloc(nlines + 1, sizeof(int));
+ if (db_grant_on_table(Driver, Fi->table, DB_PRIV_SELECT,
+ DB_GROUP | DB_PUBLIC) != DB_OK)
+ G_warning(_("Cannot grant privileges on table %s"),
+ Fi->table);
- maxdist = 1.1 * sqrt(pow(box.E - box.W, 2.0) + pow(box.N - box.S, 2.0));
+ G_important_message(_("Select privileges were granted on the table"));
- /* Assign fold to each point */
- for (i = 0; i < np; ++i) { /* for each partition */
- for (j = 0; j < p[i]; ++j) { /* create p[i] random points */
- int nearest = 0;
- double dist;
+ Vect_set_open_level(2);
+ if (Vect_open_old(&Map, map_opt->answer, G_mapset()) < 2) {
+ G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
+ map_opt->answer, 2);
+ }
+ }
- east = rng() / max * (box.E - box.W) + box.W;
- north = rng() / max * (box.N - box.S) + box.S;
+ /* nlines - count of records */
+ nlines = Vect_get_num_lines(&Map);
- G_debug(3, "east = %f north = %f", east, north);
+ /* last_pidx - current maximal index of partition array */
+ last_pidx = np - 1;
- /* find nearest */
- for (line = 1; line <= nlines; line++) {
- int type;
- double cur_dist;
+ /* minimum number of features in one partition */
+ min_count = nsites / np;
+ G_debug(1, "min count: %d", min_count);
- if (pnt_part[line] > 0)
- continue;
+ /* number of partitions that need min_count + 1 features */
+ spill = nsites - np * min_count; /* nsites % np, but modulus is evil */
+ G_debug(1, "spill: %d", spill);
- type = Vect_read_line(&In, Points, Cats, line);
+ /* array of available partitions */
+ part = (struct partition *)G_calloc(np, sizeof(struct partition));
+ if (!part)
+ G_fatal_error(_("Out of memory"));
- if (!(type & GV_POINT))
- continue;
+ /* line index */
+ line_idx = (int *)G_calloc(nlines, sizeof(int));
- cur_dist = hypot(Points->x[0] - east, Points->y[0] - north);
+ /* initialization of arrays */
+ for (p = 0; p < np; p++) {
+ part[p].id = p + 1; /* partition ids */
+ part[p].count = 0;
+ part[p].max = min_count;
+ }
+ for (p = 0; p < spill; p++) {
+ part[p].max++;
+ }
- if (nearest < 1 || cur_dist < dist) {
- nearest = line;
- dist = cur_dist;
- }
- }
+ for (i = 0; i < nlines; i++)
+ line_idx[i] = i + 1;
- G_debug(3, "nearest = %d", nearest);
+ /* proper randomization requires a 2-step randomization
+ * randomize in space
+ * randomize partition assignment
+ * looping sequentially through all points creates a bias */
- /* Update */
- if (nearest > 0) { /* shopuld be always */
- int cat;
+ /* process all points */
+ db_begin_transaction(Driver);
+ lines_left = nlines;
+ while (lines_left) {
+ int type, cat;
- Vect_read_line(&In, Points, Cats, nearest);
- Vect_cat_get(Cats, 1, &cat);
+ G_percent(nlines - lines_left, nlines, 4);
- if (nlinks < 1)
- sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
- Fi->table, Fi->key, col_opt->answer, cat, i + 1);
- else
- sprintf(buf, "update %s set %s = %d where %s = %d",
- Fi->table, col_opt->answer, i + 1, Fi->key, cat);
+ /* random point */
+ do {
+ idx = rng() * lines_left;
+ /* in case rng() returns 1.0: */
+ } while (idx >= lines_left);
- db_set_string(&sql, buf);
+ line = line_idx[idx];
+ lines_left--;
+ line_idx[idx] = line_idx[lines_left];
- G_debug(3, "SQL: %s", db_get_string(&sql));
+ if (!Vect_line_alive(&Map, line))
+ continue;
- if (db_execute_immediate(Driver, &sql) != DB_OK) {
- Vect_delete(Out.name);
- G_fatal_error(_("Unable to insert row: %s"),
- db_get_string(&sql));
- }
- pnt_part[nearest] = i + 1;
+ type = Vect_read_line(&Map, Points, Cats, line);
+ if (!(type & GV_POINT))
+ continue;
+
+ /* random partition for current point */
+ do {
+ p = rng() * (last_pidx + 1);
+ /* in case rng() returns 1.0: */
+ } while (p > last_pidx);
+
+ G_debug(3, "partition id = %d", part[p].id);
+
+ Vect_cat_get(Cats, 1, &cat);
+ if (cat < 0) {
+ G_warning(_("No category for line %d in layer %d"), line, layer);
+ continue;
+ }
+ if (nlinks < 1)
+ sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
+ Fi->table, Fi->key, col_opt->answer, cat,
+ part[p].id);
+ else
+ sprintf(buf, "update %s set %s = %d where %s = %d",
+ Fi->table, col_opt->answer, part[p].id,
+ Fi->key, cat);
+
+ db_set_string(&sql, buf);
+
+ G_debug(3, "SQL: %s", db_get_string(&sql));
+
+ if (db_execute_immediate(Driver, &sql) != DB_OK) {
+ G_fatal_error(_("Unable to insert row: %s"),
+ db_get_string(&sql));
+ }
+
+ /* increase count of features in partition */
+ part[p].count++;
+ /* if the partition is full */
+ if (part[p].count >= part[p].max) {
+ /* move last partition to current position */
+ if (p != last_pidx)
+ part[p] = part[last_pidx];
+
+ /* disable last partition */
+ last_pidx--;
+
+ if (last_pidx < 0 && lines_left) {
+ G_fatal_error(_("internal error"));
}
}
}
+ G_percent(1, 1, 1);
- Vect_close(&In);
-
+ db_commit_transaction(Driver);
db_close_database_shutdown_driver(Driver);
- Vect_build(&Out);
- Vect_close(&Out);
+ Vect_set_db_updated(&Map);
+ Vect_close(&Map);
exit(EXIT_SUCCESS);
}
+
+
More information about the grass-commit
mailing list