[GRASS-SVN] r57231 - grass/trunk/vector/v.kcv
svn_grass at osgeo.org
svn_grass at osgeo.org
Fri Jul 19 13:53:36 PDT 2013
Author: mmetz
Date: 2013-07-19 13:53:36 -0700 (Fri, 19 Jul 2013)
New Revision: 57231
Removed:
grass/trunk/vector/v.kcv/histo.c
grass/trunk/vector/v.kcv/kcv.h
Modified:
grass/trunk/vector/v.kcv/main.c
Log:
v.kvc optimization
Deleted: grass/trunk/vector/v.kcv/histo.c
===================================================================
--- grass/trunk/vector/v.kcv/histo.c 2013-07-19 14:37:41 UTC (rev 57230)
+++ grass/trunk/vector/v.kcv/histo.c 2013-07-19 20:53:36 UTC (rev 57231)
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 1993-1994. James Darrell McCauley. (darrell at mccauley-usa.com)
- * http://mccauley-usa.com/
- *
- * This program is free software under the GPL (>=v2)
- * Read the file GPL.TXT coming with GRASS for details.
- */
-
-#include <math.h>
-#include <grass/gis.h>
-
-int make_histo(int **p, int np, int nsites)
-{
- int i, j;
-
-
- /* minimum number of sites per partition */
- j = (int)floor((double)nsites / np);
-
- *p = (int *)G_malloc(np * sizeof(int));
-
- for (i = 0; i < np; ++i)
- (*p)[i] = j;
- i = np * j;
- j = 0;
- while (i++ < nsites)
- (*p)[j++]++;
-
- /* return max number of sites per partition */
- return (int)ceil((double)nsites / np);
-}
Deleted: grass/trunk/vector/v.kcv/kcv.h
===================================================================
--- grass/trunk/vector/v.kcv/kcv.h 2013-07-19 14:37:41 UTC (rev 57230)
+++ grass/trunk/vector/v.kcv/kcv.h 2013-07-19 20:53:36 UTC (rev 57231)
@@ -1,10 +0,0 @@
-/*
- * Copyright (C) 1993-1994. James Darrell McCauley. (darrell at mccauley-usa.com)
- * http://mccauley-usa.com/
- *
- * This program is free software under the GPL (>=v2)
- * Read the file GPL.TXT coming with GRASS for details.
- */
-
-/* histo.c */
-int make_histo(int **, int, int);
Modified: grass/trunk/vector/v.kcv/main.c
===================================================================
--- grass/trunk/vector/v.kcv/main.c 2013-07-19 14:37:41 UTC (rev 57230)
+++ grass/trunk/vector/v.kcv/main.c 2013-07-19 20:53:36 UTC (rev 57231)
@@ -5,6 +5,7 @@
*
* AUTHOR(S): James Darrell McCauley darrell at mccauley-usa.com
* OGR support by Martin Landa <landa.martin gmail.com> (2009)
+ * Spped-up by Jan Vandrol and Jan Ruzicka (2013)
*
* PURPOSE: Randomly partition points into test/train sets.
*
@@ -16,6 +17,7 @@
*
****************************************************************/
+
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
@@ -23,37 +25,39 @@
#include <grass/dbmi.h>
#include <grass/vector.h>
#include <grass/glocale.h>
-#include "kcv.h"
static double myrand(void)
{
- return rand() / (1.0 * RAND_MAX);
+ return rand() / (1.0 + RAND_MAX);
}
struct Cell_head window;
-int main(int argc, char *argv[])
+int main(int argc, char *argv[])
{
- int line, nlines, nlinks;
- double (*rng)(void);
- double east, north;
- int i, j, nsites, np, *p;
- int *pnt_part;
- struct Map_info In, Out;
- static struct line_pnts *Points;
+ int i, line, nlines, nlinks;
+ int idx, *line_idx, lines_left;
+ double (*rng) ();
+ int nsites, p, np, min_count, spill;
+ struct partition {
+ int id, count, max;
+ } *part;
+ int last_pidx;
+ struct Map_info Map;
+ struct line_pnts *Points;
struct line_cats *Cats;
struct GModule *module;
- struct Option *in_opt, *out_opt, *col_opt, *npart_opt, *field_opt;
+ struct Option *map_opt, *col_opt, *npart_opt, *field_opt;
struct Flag *drand48_flag;
- struct bound_box box;
- double maxdist;
-
+
/* Attributes */
struct field_info *Fi;
- dbDriver *Driver;
+ int layer;
+ dbDriver * Driver;
char buf[2000];
dbString sql;
+
module = G_define_module();
G_add_keyword(_("vector"));
G_add_keyword(_("statistics"));
@@ -61,24 +65,18 @@
module->description =
_("Randomly partition points into test/train sets.");
- in_opt = G_define_standard_option(G_OPT_V_INPUT);
+ map_opt = G_define_standard_option(G_OPT_V_MAP);
- field_opt = G_define_standard_option(G_OPT_V_FIELD_ALL);
+ field_opt = G_define_standard_option(G_OPT_V_FIELD);
- out_opt = G_define_standard_option(G_OPT_V_OUTPUT);
-
npart_opt = G_define_option();
npart_opt->key = "k";
npart_opt->type = TYPE_INTEGER;
npart_opt->required = YES;
- npart_opt->description = _("Number of partitions");
- npart_opt->options = "1-32767";
+ npart_opt->label = _("Number of partitions");
+ npart_opt->description = _("Must be > 1");
- col_opt = G_define_option();
- col_opt->key = "column";
- col_opt->type = TYPE_STRING;
- col_opt->required = YES;
- col_opt->multiple = NO;
+ col_opt = G_define_standard_option(G_OPT_DB_COLUMN);
col_opt->answer = "part";
col_opt->description =
_("Name for new column to which partition number is written");
@@ -91,12 +89,15 @@
drand48_flag->description = _("Use drand48() (ignored)");
#endif
+
G_gisinit(argv[0]);
if (G_parser(argc, argv))
exit(EXIT_FAILURE);
np = atoi(npart_opt->answer);
+ if (np < 2)
+ G_fatal_error(_("'%s' must be > 1"), npart_opt->key);
#ifdef HAVE_DRAND48
if (drand48_flag->answer) {
@@ -115,154 +116,223 @@
/* open input vector */
Vect_set_open_level(2);
- if (Vect_open_old2(&In, in_opt->answer, "", field_opt->answer) < 2) {
+ if (Vect_open_old2(&Map, map_opt->answer, G_mapset(), field_opt->answer) < 2) {
G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
- in_opt->answer, 2);
+ map_opt->answer, 2);
}
- Vect_get_map_box(&In, &box);
+ layer = Vect_get_field_number(&Map, field_opt->answer);
+ if (layer <= 0)
+ G_fatal_error(_("Layer number must be positive"));
- nsites = Vect_get_num_primitives(&In, GV_POINT);
+ nsites = Vect_get_num_primitives(&Map, GV_POINT);
if (nsites < np) {
G_fatal_error(_("More partitions than points (%d)"), nsites);
}
- Vect_open_new(&Out, out_opt->answer, Vect_is_3d(&In));
-
- Vect_copy_head_data(&In, &Out);
- Vect_hist_copy(&In, &Out);
- Vect_hist_command(&Out);
-
- /* Copy vector lines */
- Vect_copy_map_lines_field(&In, Vect_get_field_number(&In, field_opt->answer), &Out);
-
- /* Copy tables */
- if (Vect_copy_tables(&In, &Out, 0))
- G_warning(_("Failed to copy attribute table to output map"));
-
/* Add column */
db_init_string(&sql);
/* Check if there is a database for output */
- nlinks = Vect_get_num_dblinks(&Out);
+ nlinks = Vect_get_num_dblinks(&Map);
if (nlinks < 1)
- Fi = Vect_default_field_info(&Out, 1, NULL, GV_1TABLE);
+ Fi = Vect_default_field_info(&Map, layer, NULL, GV_1TABLE);
else
- Fi = Vect_get_field(&Out, 1);
+ Fi = Vect_get_field(&Map, layer);
if (Fi == NULL) {
- Vect_delete(Out.name);
G_fatal_error(_("Unable to get layer info for vector map <%s>"),
- out_opt->answer);
+ map_opt->answer);
}
Driver = db_start_driver_open_database(Fi->driver, Fi->database);
if (Driver == NULL) {
- Vect_delete(Out.name);
G_fatal_error(_("Unable to open database <%s> by driver <%s>"),
Fi->database, Fi->driver);
}
+ buf[0] = '\0';
if (nlinks < 1)
sprintf(buf, "create table %s (%s integer, %s integer)", Fi->table,
Fi->key, col_opt->answer);
- else
- sprintf(buf, "alter table %s add column %s integer", Fi->table,
- col_opt->answer);
+ else {
+ dbColumn *column = NULL;
- db_set_string(&sql, buf);
+ /* check if column exists */
+ db_get_column(Driver, Fi->table, col_opt->answer, &column);
+ if (column) {
+ int sqltype = db_get_column_sqltype(column);
+ int ctype = db_sqltype_to_Ctype(sqltype);
- G_debug(3, "SQL: %s", db_get_string(&sql));
+ if (ctype != DB_C_TYPE_INT && ctype != DB_C_TYPE_DOUBLE) {
+ G_fatal_error(_("Column <%s> already exists but is not numeric"),
+ col_opt->answer);
+ }
+ }
+ else
+ sprintf(buf, "alter table %s add column %s integer", Fi->table,
+ col_opt->answer);
+ }
- if (db_execute_immediate(Driver, &sql) != DB_OK) {
- Vect_delete(Out.name);
- G_fatal_error(_("Unable to alter table: %s"), db_get_string(&sql));
+ if (*buf) {
+ db_set_string(&sql, buf);
+
+ G_debug(3, "SQL: %s", db_get_string(&sql));
+
+ if (db_execute_immediate(Driver, &sql) != DB_OK) {
+ G_fatal_error(_("Unable to alter table: %s"), db_get_string(&sql));
+ }
}
- if (nlinks < 1)
- Vect_map_add_dblink(&Out, Fi->number, Fi->name, Fi->table, Fi->key,
+
+ if (nlinks < 1) {
+ Vect_set_open_level(1);
+ Vect_close(&Map);
+ if (Vect_open_update_head(&Map, map_opt->answer, G_mapset()) < 1)
+ G_fatal_error(_("Unable to modify vector map stored in other mapset"));
+ Vect_map_add_dblink(&Map, layer, Fi->name, Fi->table, Fi->key,
Fi->database, Fi->driver);
+ Vect_close(&Map);
- /*
- * make histogram of number sites in each test partition since the
- * number of sites will not always be a multiple of the number of
- * partitions. make_histo() returns the maximum bin size.
- */
- make_histo(&p, np, nsites);
+ if (db_create_index2(Driver, Fi->table, Fi->key) !=
+ DB_OK)
+ G_warning(_("Cannot create index"));
- nlines = Vect_get_num_lines(&In);
- pnt_part = (int *)G_calloc(nlines + 1, sizeof(int));
+ if (db_grant_on_table(Driver, Fi->table, DB_PRIV_SELECT,
+ DB_GROUP | DB_PUBLIC) != DB_OK)
+ G_warning(_("Cannot grant privileges on table %s"),
+ Fi->table);
- maxdist = 1.1 * sqrt(pow(box.E - box.W, 2.0) + pow(box.N - box.S, 2.0));
+ G_important_message(_("Select privileges were granted on the table"));
- /* Assign fold to each point */
- for (i = 0; i < np; ++i) { /* for each partition */
- for (j = 0; j < p[i]; ++j) { /* create p[i] random points */
- int nearest = 0;
- double dist;
+ Vect_set_open_level(2);
+ if (Vect_open_old2(&Map, map_opt->answer, G_mapset(), field_opt->answer) < 2) {
+ G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
+ map_opt->answer, 2);
+ }
+ }
- east = rng() * (box.E - box.W) + box.W;
- north = rng() * (box.N - box.S) + box.S;
+ /* nlines - count of records */
+ nlines = Vect_get_num_lines(&Map);
- G_debug(3, "east = %f north = %f", east, north);
+ /* last_pidx - current maximal index of partition array */
+ last_pidx = np - 1;
- /* find nearest */
- for (line = 1; line <= nlines; line++) {
- int type;
- double cur_dist;
+ /* minimum number of features in one partition */
+ min_count = nsites / np;
+ G_debug(1, "min count: %d", min_count);
- if (pnt_part[line] > 0)
- continue;
+ /* number of partitions that need min_count + 1 features */
+ spill = nsites - np * min_count; /* nsites % np, but modulus is evil */
+ G_debug(1, "spill: %d", spill);
- type = Vect_read_line(&In, Points, Cats, line);
+ /* array of available partitions */
+ part = (struct partition *)G_calloc(np, sizeof(struct partition));
+ if (!part)
+ G_fatal_error(_("Out of memory"));
- if (!(type & GV_POINT))
- continue;
+ /* line index */
+ line_idx = (int *)G_calloc(nlines, sizeof(int));
- cur_dist = hypot(Points->x[0] - east, Points->y[0] - north);
+ /* initialization of arrays */
+ for (p = 0; p < np; p++) {
+ part[p].id = p + 1; /* partition ids */
+ part[p].count = 0;
+ part[p].max = min_count;
+ }
+ for (p = 0; p < spill; p++) {
+ part[p].max++;
+ }
- if (nearest < 1 || cur_dist < dist) {
- nearest = line;
- dist = cur_dist;
- }
- }
+ for (i = 0; i < nlines; i++)
+ line_idx[i] = i + 1;
- G_debug(3, "nearest = %d", nearest);
+ /* proper randomization requires a 2-step randomization
+ * randomize in space
+ * randomize partition assignment
+ * looping sequentially through all points creates a bias */
- /* Update */
- if (nearest > 0) { /* shopuld be always */
- int cat;
+ /* process all points */
+ db_begin_transaction(Driver);
+ lines_left = nlines;
+ while (lines_left) {
+ int type, cat;
- Vect_read_line(&In, Points, Cats, nearest);
- Vect_cat_get(Cats, 1, &cat);
+ G_percent(nlines - lines_left, nlines, 4);
- if (nlinks < 1)
- sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
- Fi->table, Fi->key, col_opt->answer, cat, i + 1);
- else
- sprintf(buf, "update %s set %s = %d where %s = %d",
- Fi->table, col_opt->answer, i + 1, Fi->key, cat);
+ /* random point */
+ do {
+ idx = rng() * lines_left;
+ /* in case rng() returns 1.0: */
+ } while (idx >= lines_left);
- db_set_string(&sql, buf);
+ line = line_idx[idx];
+ lines_left--;
+ line_idx[idx] = line_idx[lines_left];
- G_debug(3, "SQL: %s", db_get_string(&sql));
+ if (!Vect_line_alive(&Map, line))
+ continue;
- if (db_execute_immediate(Driver, &sql) != DB_OK) {
- Vect_delete(Out.name);
- G_fatal_error(_("Unable to insert row: %s"),
- db_get_string(&sql));
- }
- pnt_part[nearest] = i + 1;
+ type = Vect_read_line(&Map, Points, Cats, line);
+ if (!(type & GV_POINT))
+ continue;
+
+ /* random partition for current point */
+ do {
+ p = rng() * (last_pidx + 1);
+ /* in case rng() returns 1.0: */
+ } while (p > last_pidx);
+
+ G_debug(3, "partition id = %d", part[p].id);
+
+ Vect_cat_get(Cats, 1, &cat);
+ if (cat < 0) {
+ G_warning(_("No category for line %d in layer %d"), line, layer);
+ continue;
+ }
+ if (nlinks < 1)
+ sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
+ Fi->table, Fi->key, col_opt->answer, cat,
+ part[p].id);
+ else
+ sprintf(buf, "update %s set %s = %d where %s = %d",
+ Fi->table, col_opt->answer, part[p].id,
+ Fi->key, cat);
+
+ db_set_string(&sql, buf);
+
+ G_debug(3, "SQL: %s", db_get_string(&sql));
+
+ if (db_execute_immediate(Driver, &sql) != DB_OK) {
+ G_fatal_error(_("Unable to insert row: %s"),
+ db_get_string(&sql));
+ }
+
+ /* increase count of features in partition */
+ part[p].count++;
+ /* if the partition is full */
+ if (part[p].count >= part[p].max) {
+ /* move last partition to current position */
+ if (p != last_pidx)
+ part[p] = part[last_pidx];
+
+ /* disable last partition */
+ last_pidx--;
+
+ if (last_pidx < 0 && lines_left) {
+ G_fatal_error(_("internal error"));
}
}
}
+ G_percent(1, 1, 1);
- Vect_close(&In);
-
+ db_commit_transaction(Driver);
db_close_database_shutdown_driver(Driver);
- Vect_build(&Out);
- Vect_close(&Out);
+ Vect_set_db_updated(&Map);
+ Vect_close(&Map);
exit(EXIT_SUCCESS);
}
+
+
More information about the grass-commit
mailing list