[GRASS-SVN] r57231 - grass/trunk/vector/v.kcv

svn_grass at osgeo.org svn_grass at osgeo.org
Fri Jul 19 13:53:36 PDT 2013


Author: mmetz
Date: 2013-07-19 13:53:36 -0700 (Fri, 19 Jul 2013)
New Revision: 57231

Removed:
   grass/trunk/vector/v.kcv/histo.c
   grass/trunk/vector/v.kcv/kcv.h
Modified:
   grass/trunk/vector/v.kcv/main.c
Log:
v.kvc optimization

Deleted: grass/trunk/vector/v.kcv/histo.c
===================================================================
--- grass/trunk/vector/v.kcv/histo.c	2013-07-19 14:37:41 UTC (rev 57230)
+++ grass/trunk/vector/v.kcv/histo.c	2013-07-19 20:53:36 UTC (rev 57231)
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 1993-1994. James Darrell McCauley. (darrell at mccauley-usa.com)
- *                                http://mccauley-usa.com/
- *
- * This program is free software under the GPL (>=v2)
- * Read the file GPL.TXT coming with GRASS for details.
- */
-
-#include <math.h>
-#include <grass/gis.h>
-
-int make_histo(int **p, int np, int nsites)
-{
-    int i, j;
-
-
-    /* minimum number of sites per partition */
-    j = (int)floor((double)nsites / np);
-
-    *p = (int *)G_malloc(np * sizeof(int));
-
-    for (i = 0; i < np; ++i)
-	(*p)[i] = j;
-    i = np * j;
-    j = 0;
-    while (i++ < nsites)
-	(*p)[j++]++;
-
-    /* return max number of sites per partition */
-    return (int)ceil((double)nsites / np);
-}

Deleted: grass/trunk/vector/v.kcv/kcv.h
===================================================================
--- grass/trunk/vector/v.kcv/kcv.h	2013-07-19 14:37:41 UTC (rev 57230)
+++ grass/trunk/vector/v.kcv/kcv.h	2013-07-19 20:53:36 UTC (rev 57231)
@@ -1,10 +0,0 @@
-/*
- * Copyright (C) 1993-1994. James Darrell McCauley. (darrell at mccauley-usa.com)
- *                                http://mccauley-usa.com/
- *
- * This program is free software under the GPL (>=v2)
- * Read the file GPL.TXT coming with GRASS for details.
- */
-
-/* histo.c */
-int make_histo(int **, int, int);

Modified: grass/trunk/vector/v.kcv/main.c
===================================================================
--- grass/trunk/vector/v.kcv/main.c	2013-07-19 14:37:41 UTC (rev 57230)
+++ grass/trunk/vector/v.kcv/main.c	2013-07-19 20:53:36 UTC (rev 57231)
@@ -5,6 +5,7 @@
  *
  * AUTHOR(S):  James Darrell McCauley darrell at mccauley-usa.com
  *             OGR support by Martin Landa <landa.martin gmail.com> (2009)
+ *             Spped-up by Jan Vandrol and Jan Ruzicka (2013)
  *
  * PURPOSE:    Randomly partition points into test/train sets.
  *
@@ -16,6 +17,7 @@
  *
  ****************************************************************/
 
+
 #include <stdlib.h>
 #include <unistd.h>
 #include <math.h>
@@ -23,37 +25,39 @@
 #include <grass/dbmi.h>
 #include <grass/vector.h>
 #include <grass/glocale.h>
-#include "kcv.h"
 
 static double myrand(void)
 {
-    return rand() / (1.0 * RAND_MAX);
+    return rand() / (1.0 + RAND_MAX);
 }
 
 struct Cell_head window;
 
-int main(int argc, char *argv[])
+int main(int argc, char *argv[]) 
 {
-    int line, nlines, nlinks;
-    double (*rng)(void);
-    double east, north;
-    int i, j, nsites, np, *p;
-    int *pnt_part;
-    struct Map_info In, Out;
-    static struct line_pnts *Points;
+    int i, line, nlines, nlinks;
+    int idx, *line_idx, lines_left;
+    double (*rng) ();
+    int nsites, p, np, min_count, spill;
+    struct partition {
+	int id, count, max;
+    } *part;
+    int last_pidx;
+    struct Map_info Map;
+    struct line_pnts *Points;
     struct line_cats *Cats;
     struct GModule *module;
-    struct Option *in_opt, *out_opt, *col_opt, *npart_opt, *field_opt;
+    struct Option *map_opt, *col_opt, *npart_opt, *field_opt;
     struct Flag *drand48_flag;
-    struct bound_box box;
-    double maxdist;
-
+ 
     /* Attributes */
     struct field_info *Fi;
-    dbDriver *Driver;
+    int layer;
+    dbDriver * Driver;
     char buf[2000];
     dbString sql;
 
+
     module = G_define_module();
     G_add_keyword(_("vector"));
     G_add_keyword(_("statistics"));
@@ -61,24 +65,18 @@
     module->description =
 	_("Randomly partition points into test/train sets.");
 
-    in_opt = G_define_standard_option(G_OPT_V_INPUT);
+    map_opt = G_define_standard_option(G_OPT_V_MAP);
 
-    field_opt = G_define_standard_option(G_OPT_V_FIELD_ALL);
+    field_opt = G_define_standard_option(G_OPT_V_FIELD);
 
-    out_opt = G_define_standard_option(G_OPT_V_OUTPUT);
-
     npart_opt = G_define_option();
     npart_opt->key = "k";
     npart_opt->type = TYPE_INTEGER;
     npart_opt->required = YES;
-    npart_opt->description = _("Number of partitions");
-    npart_opt->options = "1-32767";
+    npart_opt->label = _("Number of partitions");
+    npart_opt->description = _("Must be > 1");
 
-    col_opt = G_define_option();
-    col_opt->key = "column";
-    col_opt->type = TYPE_STRING;
-    col_opt->required = YES;
-    col_opt->multiple = NO;
+    col_opt = G_define_standard_option(G_OPT_DB_COLUMN);
     col_opt->answer = "part";
     col_opt->description =
 	_("Name for new column to which partition number is written");
@@ -91,12 +89,15 @@
     drand48_flag->description = _("Use drand48() (ignored)");
 #endif
 
+
     G_gisinit(argv[0]);
 
     if (G_parser(argc, argv))
 	exit(EXIT_FAILURE);
 
     np = atoi(npart_opt->answer);
+    if (np < 2)
+	G_fatal_error(_("'%s' must be > 1"), npart_opt->key);
 
 #ifdef HAVE_DRAND48
     if (drand48_flag->answer) {
@@ -115,154 +116,223 @@
 
     /* open input vector */
     Vect_set_open_level(2);
-    if (Vect_open_old2(&In, in_opt->answer, "", field_opt->answer) < 2) {
+    if (Vect_open_old2(&Map, map_opt->answer, G_mapset(), field_opt->answer) < 2) {
 	G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
-		      in_opt->answer, 2);
+		      map_opt->answer, 2);
     }
 
-    Vect_get_map_box(&In, &box);
+    layer = Vect_get_field_number(&Map, field_opt->answer);
+    if (layer <= 0)
+	G_fatal_error(_("Layer number must be positive"));
 
-    nsites = Vect_get_num_primitives(&In, GV_POINT);
+    nsites = Vect_get_num_primitives(&Map, GV_POINT);
 
     if (nsites < np) {
 	G_fatal_error(_("More partitions than points (%d)"), nsites);
     }
 
-    Vect_open_new(&Out, out_opt->answer, Vect_is_3d(&In));
-
-    Vect_copy_head_data(&In, &Out);
-    Vect_hist_copy(&In, &Out);
-    Vect_hist_command(&Out);
-
-    /* Copy vector lines */
-    Vect_copy_map_lines_field(&In, Vect_get_field_number(&In, field_opt->answer), &Out);
-
-    /* Copy tables */
-    if (Vect_copy_tables(&In, &Out, 0))
-	G_warning(_("Failed to copy attribute table to output map"));
-
     /* Add column */
     db_init_string(&sql);
 
     /* Check if there is a database for output */
-    nlinks = Vect_get_num_dblinks(&Out);
+    nlinks = Vect_get_num_dblinks(&Map);
     if (nlinks < 1)
-	Fi = Vect_default_field_info(&Out, 1, NULL, GV_1TABLE);
+	Fi = Vect_default_field_info(&Map, layer, NULL, GV_1TABLE);
     else
-	Fi = Vect_get_field(&Out, 1);
+	Fi = Vect_get_field(&Map, layer);
     if (Fi == NULL) {
-	Vect_delete(Out.name);
 	G_fatal_error(_("Unable to get layer info for vector map <%s>"),
-		      out_opt->answer);
+		      map_opt->answer);
     }
 
     Driver = db_start_driver_open_database(Fi->driver, Fi->database);
     if (Driver == NULL) {
-	Vect_delete(Out.name);
 	G_fatal_error(_("Unable to open database <%s> by driver <%s>"),
 		      Fi->database, Fi->driver);
     }
 
+    buf[0] = '\0';
     if (nlinks < 1)
 	sprintf(buf, "create table %s (%s integer, %s integer)", Fi->table,
 		Fi->key, col_opt->answer);
-    else
-	sprintf(buf, "alter table %s add column %s integer", Fi->table,
-		col_opt->answer);
+    else {
+	dbColumn *column = NULL;
 
-    db_set_string(&sql, buf);
+	/* check if column exists */
+	db_get_column(Driver, Fi->table, col_opt->answer, &column);
+	if (column) {
+            int sqltype = db_get_column_sqltype(column);
+	    int ctype = db_sqltype_to_Ctype(sqltype);
 
-    G_debug(3, "SQL: %s", db_get_string(&sql));
+	    if (ctype != DB_C_TYPE_INT && ctype != DB_C_TYPE_DOUBLE) {
+		G_fatal_error(_("Column <%s> already exists but is not numeric"),
+		              col_opt->answer);
+	    }
+	}
+	else
+	    sprintf(buf, "alter table %s add column %s integer", Fi->table,
+		    col_opt->answer);
+    }
 
-    if (db_execute_immediate(Driver, &sql) != DB_OK) {
-	Vect_delete(Out.name);
-	G_fatal_error(_("Unable to alter table: %s"), db_get_string(&sql));
+    if (*buf) {
+	db_set_string(&sql, buf);
+
+	G_debug(3, "SQL: %s", db_get_string(&sql));
+
+	if (db_execute_immediate(Driver, &sql) != DB_OK) {
+	    G_fatal_error(_("Unable to alter table: %s"), db_get_string(&sql));
+	}
     }
-    if (nlinks < 1)
-	Vect_map_add_dblink(&Out, Fi->number, Fi->name, Fi->table, Fi->key,
+
+    if (nlinks < 1) {
+	Vect_set_open_level(1);
+	Vect_close(&Map);
+	if (Vect_open_update_head(&Map, map_opt->answer, G_mapset()) < 1)
+	    G_fatal_error(_("Unable to modify vector map stored in other mapset"));
+	Vect_map_add_dblink(&Map, layer, Fi->name, Fi->table, Fi->key,
 			    Fi->database, Fi->driver);
+	Vect_close(&Map);
 
-    /*
-     * make histogram of number sites in each test partition since the
-     * number of sites will not always be a multiple of the number of
-     * partitions. make_histo() returns the maximum bin size.
-     */
-    make_histo(&p, np, nsites);
+	if (db_create_index2(Driver, Fi->table, Fi->key) !=
+	    DB_OK)
+	    G_warning(_("Cannot create index"));
 
-    nlines = Vect_get_num_lines(&In);
-    pnt_part = (int *)G_calloc(nlines + 1, sizeof(int));
+	if (db_grant_on_table(Driver, Fi->table, DB_PRIV_SELECT,
+	                      DB_GROUP | DB_PUBLIC) != DB_OK)
+	    G_warning(_("Cannot grant privileges on table %s"),
+		      Fi->table);
 
-    maxdist = 1.1 * sqrt(pow(box.E - box.W, 2.0) + pow(box.N - box.S, 2.0));
+	G_important_message(_("Select privileges were granted on the table"));
 
-    /* Assign fold to each point */
-    for (i = 0; i < np; ++i) {	/* for each partition */
-	for (j = 0; j < p[i]; ++j) {	/* create p[i] random points */
-	    int nearest = 0;
-	    double dist;
+	Vect_set_open_level(2);
+	if (Vect_open_old2(&Map, map_opt->answer, G_mapset(), field_opt->answer) < 2) {
+	    G_fatal_error(_("Unable to open vector map <%s> at topological level %d"),
+			  map_opt->answer, 2);
+	}
+    }
 
-	    east = rng() * (box.E - box.W) + box.W;
-	    north = rng() * (box.N - box.S) + box.S;
+    /* nlines - count of records */ 
+    nlines = Vect_get_num_lines(&Map);
 
-	    G_debug(3, "east = %f north = %f", east, north);
+    /* last_pidx - current maximal index of partition array */ 
+    last_pidx = np - 1;
 
-	    /* find nearest */
-	    for (line = 1; line <= nlines; line++) {
-		int type;
-		double cur_dist;
+    /* minimum number of features in one partition */
+    min_count = nsites / np;
+    G_debug(1, "min count: %d", min_count);
 
-		if (pnt_part[line] > 0)
-		    continue;
+    /* number of partitions that need min_count + 1 features */
+    spill = nsites - np * min_count;  /* nsites % np, but modulus is evil */
+    G_debug(1, "spill: %d", spill);
 
-		type = Vect_read_line(&In, Points, Cats, line);
+    /* array of available partitions */ 
+    part = (struct partition *)G_calloc(np, sizeof(struct partition));
+    if (!part)
+	G_fatal_error(_("Out of memory"));
 
-		if (!(type & GV_POINT))
-		    continue;
+    /* line index */ 
+    line_idx = (int *)G_calloc(nlines, sizeof(int));
 
-		cur_dist = hypot(Points->x[0] - east, Points->y[0] - north);
+    /* initialization of arrays */ 
+    for (p = 0; p < np; p++) {
+	part[p].id = p + 1;	/* partition ids */
+	part[p].count = 0;
+	part[p].max = min_count;
+    }
+    for (p = 0; p < spill; p++) {
+	part[p].max++;
+    }
 
-		if (nearest < 1 || cur_dist < dist) {
-		    nearest = line;
-		    dist = cur_dist;
-		}
-	    }
+    for (i = 0; i < nlines; i++)
+	line_idx[i] = i + 1;
 
-	    G_debug(3, "nearest = %d", nearest);
+    /* proper randomization requires a 2-step randomization
+     * randomize in space
+     * randomize partition assignment
+     * looping sequentially through all points creates a bias */
 
-	    /* Update */
-	    if (nearest > 0) {	/* shopuld be always */
-		int cat;
+    /* process all points */
+    db_begin_transaction(Driver);
+    lines_left = nlines;
+    while (lines_left) {
+	int type, cat;
 
-		Vect_read_line(&In, Points, Cats, nearest);
-		Vect_cat_get(Cats, 1, &cat);
+	G_percent(nlines - lines_left, nlines, 4);
 
-		if (nlinks < 1)
-		    sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
-			    Fi->table, Fi->key, col_opt->answer, cat, i + 1);
-		else
-		    sprintf(buf, "update %s set %s = %d where %s = %d",
-			    Fi->table, col_opt->answer, i + 1, Fi->key, cat);
+	/* random point */
+	do {
+	    idx = rng() * lines_left;
+	    /* in case rng() returns 1.0: */
+	} while (idx >= lines_left);
 
-		db_set_string(&sql, buf);
+	line = line_idx[idx];
+	lines_left--;
+	line_idx[idx] = line_idx[lines_left];
 
-		G_debug(3, "SQL: %s", db_get_string(&sql));
+	if (!Vect_line_alive(&Map, line))
+	    continue;
 
-		if (db_execute_immediate(Driver, &sql) != DB_OK) {
-		    Vect_delete(Out.name);
-		    G_fatal_error(_("Unable to insert row: %s"),
-				  db_get_string(&sql));
-		}
-		pnt_part[nearest] = i + 1;
+	type = Vect_read_line(&Map, Points, Cats, line);
 
+	if (!(type & GV_POINT))
+	    continue;
+
+	/* random partition for current point */
+	do {
+	    p = rng() * (last_pidx + 1);
+	    /* in case rng() returns 1.0: */
+	} while (p > last_pidx);
+
+	G_debug(3, "partition id = %d", part[p].id);
+
+	Vect_cat_get(Cats, 1, &cat);
+	if (cat < 0) {
+	    G_warning(_("No category for line %d in layer %d"), line, layer);
+	    continue;
+	}
+	if (nlinks < 1)
+	    sprintf(buf, "insert into %s (%s, %s) values (%d, %d)",
+			 Fi->table, Fi->key, col_opt->answer, cat,
+			 part[p].id);
+	else
+	    sprintf(buf, "update %s set %s = %d where %s = %d", 
+	                 Fi->table, col_opt->answer, part[p].id,
+			 Fi->key, cat);
+
+	db_set_string(&sql, buf);
+
+	G_debug(3, "SQL: %s", db_get_string(&sql));
+
+	if (db_execute_immediate(Driver, &sql) != DB_OK) {
+	    G_fatal_error(_("Unable to insert row: %s"),
+	    db_get_string(&sql));
+	}
+
+	/* increase count of features in partition */
+	part[p].count++;
+	/* if the partition is full */
+	if (part[p].count >= part[p].max) {
+	    /* move last partition to current position */
+	    if (p != last_pidx)
+		part[p] = part[last_pidx];
+
+	    /* disable last partition */
+	    last_pidx--;
+
+	    if (last_pidx < 0 && lines_left) {
+		G_fatal_error(_("internal error"));
 	    }
 	}
     }
+    G_percent(1, 1, 1);
 
-    Vect_close(&In);
-
+    db_commit_transaction(Driver);
     db_close_database_shutdown_driver(Driver);
 
-    Vect_build(&Out);
-    Vect_close(&Out);
+    Vect_set_db_updated(&Map);
+    Vect_close(&Map);
 
     exit(EXIT_SUCCESS);
 }
+
+



More information about the grass-commit mailing list