[GRASS-SVN] r62688 - grass-addons/grass7/vector/v.class.ml
svn_grass at osgeo.org
Sun Nov 9 10:19:12 PST 2014
Author: zarch
Date: 2014-11-09 10:19:12 -0800 (Sun, 09 Nov 2014)
New Revision: 62688
Modified:
grass-addons/grass7/vector/v.class.ml/ml_classifiers.py
grass-addons/grass7/vector/v.class.ml/ml_functions.py
grass-addons/grass7/vector/v.class.ml/npy2table.py
grass-addons/grass7/vector/v.class.ml/sqlite2npy.py
grass-addons/grass7/vector/v.class.ml/training_extraction.py
grass-addons/grass7/vector/v.class.ml/v.class.ml.py
Log:
v.class.ml: fix bugs (pygrass messenger API, sklearn metrics keyword arguments, scaling/decomposition pipeline, result export with append mode) and remove leftover ipdb traces
Modified: grass-addons/grass7/vector/v.class.ml/ml_classifiers.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_classifiers.py 2014-11-09 18:08:55 UTC (rev 62687)
+++ grass-addons/grass7/vector/v.class.ml/ml_classifiers.py 2014-11-09 18:19:12 UTC (rev 62688)
@@ -23,9 +23,9 @@
from sklearn import metrics
-from grass.pygrass.messages import Messenger
+from grass.pygrass.messages import get_msgr
-MSGR = Messenger()
+MSGR = get_msgr()
try:
import mlpy
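
Note on the hunk above: the Messenger() constructor is replaced by the get_msgr() factory, which hands back a shared Messenger instance instead of building a new one per module. A minimal sketch of the new call, assuming a running GRASS 7 session (the import fails outside one):

    from grass.pygrass.messages import get_msgr

    MSGR = get_msgr()  # shared Messenger instance
    MSGR.message("classifiers module loaded")
    MSGR.warning("mlpy not available, skipping mlpy classifiers")
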
Modified: grass-addons/grass7/vector/v.class.ml/ml_functions.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_functions.py 2014-11-09 18:08:55 UTC (rev 62687)
+++ grass-addons/grass7/vector/v.class.ml/ml_functions.py 2014-11-09 18:19:12 UTC (rev 62688)
@@ -71,8 +71,8 @@
cls['t_acc'] = metrics.accuracy_score(sol, pred, normalize=True)
lab = [labels[key] for key in clsses]
- cls['report'] = metrics.classification_report(sol, pred, lab)
- cls['confusion'] = metrics.confusion_matrix(sol, pred, lab)
+ cls['report'] = metrics.classification_report(sol, pred, target_names=lab)
+ cls['confusion'] = metrics.confusion_matrix(sol, pred)
c_acc = []
for c in clsses:
indx = (sol == c).nonzero()
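
The keyword fix above matters because the third positional parameter of classification_report is labels (the class values themselves), not the display names, so passing names positionally silently misuses them; confusion_matrix likewise takes class values, not names. A self-contained sketch with hypothetical class names:

    from sklearn import metrics

    sol = [0, 1, 2, 2, 1]                  # true classes
    pred = [0, 0, 2, 2, 1]                 # predicted classes
    names = ['water', 'forest', 'urban']   # hypothetical display names
    print(metrics.classification_report(sol, pred, target_names=names))
    print(metrics.confusion_matrix(sol, pred))
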
@@ -106,7 +106,7 @@
cls['pred_stop'] = time.time()
print_test(cls, save=report)
report.write('\n' + cls['report'])
- report.write('\n' + cls['confusion'])
+ report.write('\n' + str(cls['confusion']))
np.save(cls['name'] + '.npy', cls['predict'])
@@ -136,7 +136,7 @@
def optimize_training(cls, tdata, tclss, labels,
- scaler=None, num=None, maxiterations=1000):
+ scaler=None, decmp=None, num=None, maxiterations=1000):
best = cls.copy()
best['c_acc_mean'] = 0
means = []
@@ -144,11 +144,19 @@
for i in range(maxiterations): # TODO: use multicore
#msgr.percent(i, maxiterations, 1)
Xt, Yt = balance(tdata, tclss, num)
+ stdata = None
+ sXt = None
if scaler:
scaler.fit(Xt, Yt)
sXt = scaler.transform(Xt)
stdata = scaler.transform(tdata)
- else:
+ if decmp:
+ sXt = sXt if sXt else Xt
+ stdata = stdata if stdata else tdata
+ decmp.fit(sXt)
+ sXt = decmp.transform(sXt)
+ stdata = decmp.transform(stdata)
+ if scaler is None and decmp is None:
sXt, stdata = Xt, tdata
test_classifier(cls, sXt, Yt, stdata, tclss, labels, verbose=False)
if cls['c_acc_mean'] > best['c_acc_mean']:
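
The new decmp branch chains an optional scaler and an optional decomposition, fitting both on the balanced training subset only and then transforming the full data set with the fitted objects. Since a bare truth test on a NumPy array raises ValueError, here is a sketch of the same pattern with explicit None checks (StandardScaler and PCA are stand-ins for whatever the user configures):

    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    def transform_sets(Xt, tdata, scaler=None, decmp=None):
        sXt, stdata = Xt, tdata
        if scaler is not None:
            scaler.fit(sXt)
            sXt, stdata = scaler.transform(sXt), scaler.transform(stdata)
        if decmp is not None:
            decmp.fit(sXt)  # fit on the training subset only
            sXt, stdata = decmp.transform(sXt), decmp.transform(stdata)
        return sXt, stdata

    sXt, stdata = transform_sets(np.random.rand(20, 4), np.random.rand(100, 4),
                                 StandardScaler(), PCA(n_components=2))
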
@@ -197,15 +205,13 @@
fig.savefig("bv__%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
-def plot_confusion_matrix(cm, labels, name, fmt='png', **kwargs):
- conf = cm.sum(axis=2)
- conf /= conf.sum(axis=1)
+def plot_confusion_matrix(cnf, labels, name, fmt='png', **kwargs):
fig, ax = plt.subplots(figsize=(6, 5))
- img = ax.imshow(conf, cmap=CMAP)
+ img = ax.imshow(cnf, interpolation='nearest', cmap=CMAP)
fig.colorbar(img)
ticks = range(len(labels))
ax.set_xticks(ticks)
- ax.set_xticklabels(labels)
+ ax.set_xticklabels(labels, rotation=90)
ax.xaxis.set_ticks_position("bottom")
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
@@ -287,7 +293,6 @@
train_errors, test_errors, scores, cms = [], [], [], []
lk = {l: {k: [] for k in keys} for l in clss}
clf = cls['classifier'](**cls['kwargs'])
- import ipdb; ipdb.set_trace()
for train, test in cv:
X_train, y_train = tdata[train], tclss[train]
X_test, y_test = tdata[test], tclss[test]
@@ -302,13 +307,13 @@
test_errors.append(1 - test_score)
y_pred = clf.predict(X_test)
- cms.append(confusion_matrix(y_test, y_pred, lbs))
+ cms.append(confusion_matrix(y_test, y_pred))
# get probability
proba = clf.predict_proba(X_test)
# compute score for each class VS rest
for idx, label in enumerate(clss):
fpr, tpr, roc_thr = roc_curve(y_test, proba[:, idx], label)
- precision, recall, pr_thr = prc(y_test, proba[:, idx], label)
+ precision, recall, pr_thr = prc(y_test==label, proba[:, idx], label)
lk[label]['fprs'].append(fpr)
lk[label]['tprs'].append(tpr)
lk[label]['roc_scores'].append(auc(fpr, tpr))
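
The precision-recall fix binarizes y_test before the call, since precision_recall_curve expects a binary ground truth (or an explicit pos_label), while roc_curve already takes the positive label directly. A toy one-vs-rest sketch for class 1:

    import numpy as np
    from sklearn.metrics import roc_curve, precision_recall_curve

    y_test = np.array([0, 1, 2, 1, 0, 2])
    proba_1 = np.array([0.1, 0.8, 0.2, 0.7, 0.3, 0.1])  # P(class == 1)
    fpr, tpr, roc_thr = roc_curve(y_test, proba_1, pos_label=1)
    precision, recall, pr_thr = precision_recall_curve(y_test == 1, proba_1)
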
@@ -338,15 +343,18 @@
lk[cl]['tprs'][median],
lk[cl]['roc_scores'][median],
name=name, label=labels[cl])
- plot_confusion_matrix(cls['confusion matrix'],
+ cnf = np.array(cls['confusion matrix'], dtype=np.float)
+ sc = cnf.sum(axis=0)
+ norm = sc / sc.sum(axis=1)[:, None]
+ plot_confusion_matrix(norm,
labels=[labels[cl] for cl in clss],
- name=cls['name'])
+ name=cls['name'], **kwargs)
def explorer_clsfiers(clsses, Xd, Yd, labels, indexes=None, n_folds=5,
bv=False, extra=False):
gen = zip(indexes, clsses) if indexes else enumerate(clsses)
- cv = StratifiedKFold(Yd, n_folds=n_folds)
+ cv = StratifiedKFold(Yd, n_folds=n_folds, shuffle=True)
fmt = '%5d %-30s %6.4f %6.4f %6.4f %6.4f'
res = []
kw = dict(bbox_inches="tight", dpi=300)
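
The normalization added before plot_confusion_matrix sums the per-fold matrices stacked along axis 0 and divides each row by its total, so every row (true class) sums to 1; cls['confusion matrix'] is assumed to hold the matrices collected in the cross-validation loop above. A sketch with two fake folds:

    import numpy as np

    cms = [np.array([[5, 1], [2, 4]]), np.array([[6, 0], [1, 5]])]
    cnf = np.array(cms, dtype=float)
    sc = cnf.sum(axis=0)                  # fold-summed confusion matrix
    norm = sc / sc.sum(axis=1)[:, None]   # row-normalized, rows sum to 1
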
@@ -377,13 +385,11 @@
train_clr='b', test_clr='r', alpha=0.2,
fmt='png', **kw)
if extra:
- import ipdb; ipdb.set_trace()
extra_analysis(cls, Xd, Yd, labels)
plot_extra(cls, labels, **kw)
with open("%s.pkl" % cls['name'].replace(' ', '_'), 'wb') as pkl:
pk.dump(cls, pkl)
except:
- #import ipdb; ipdb.set_trace()
#print('problem with: %s' % cls['name'])
pass
return np.array(res, dtype=SCORES_DTYPE)
@@ -406,9 +412,10 @@
def plot_grid(grid, save=''):
- C = grid.param_grid['C']
- gamma = grid.param_grid['gamma']
- kernels = grid.param_grid['kernel']
+ C = grid.param_grid.get('C', 0)
+ gamma = grid.param_grid.get('gamma', 0)
+ kernels = grid.param_grid.get('kernel', 0)
+ degrees = grid.param_grid.get('degree', None)
for kernel in kernels:
scores = [x[1] for x in grid.grid_scores_ if x[0]['kernel'] == kernel]
scores = np.array(scores).reshape(len(C), len(gamma))
@@ -423,8 +430,7 @@
ax.set_xticklabels(gamma, rotation=45)
ax.set_yticks(np.arange(len(C)))
ax.set_yticklabels(C)
-# if kernel == 'poly':
-# import ipdb; ipdb.set_trace()
+
ic, igamma = np.unravel_index(np.argmax(scores), scores.shape)
ax.plot(igamma, ic, 'r.')
best = scores[ic, igamma]
@@ -439,7 +445,7 @@
def explore_SVC(Xt, Yt, n_folds=3, n_jobs=1, **kwargs):
- cv = StratifiedKFold(y=Yt, n_folds=n_folds)
+ cv = StratifiedKFold(y=Yt, n_folds=n_folds, shuffle=True)
grid = GridSearchCV(SVC(), param_grid=kwargs, cv=cv, n_jobs=n_jobs,
verbose=2)
grid.fit(Xt, Yt)
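
For reference, the exploration above with the pre-0.18 scikit-learn API this module targets (sklearn.grid_search and sklearn.cross_validation), on toy data:

    import numpy as np
    from sklearn.svm import SVC
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    Xt, Yt = np.random.rand(30, 4), np.repeat([0, 1, 2], 10)
    cv = StratifiedKFold(y=Yt, n_folds=3, shuffle=True)
    grid = GridSearchCV(SVC(), cv=cv, n_jobs=1,
                        param_grid=dict(C=[1., 10.], gamma=[.1, 1.],
                                        kernel=['rbf']))
    grid.fit(Xt, Yt)
    print(grid.best_params_, grid.best_score_)
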
Modified: grass-addons/grass7/vector/v.class.ml/npy2table.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/npy2table.py 2014-11-09 18:08:55 UTC (rev 62687)
+++ grass-addons/grass7/vector/v.class.ml/npy2table.py 2014-11-09 18:19:12 UTC (rev 62688)
@@ -32,11 +32,17 @@
table.conn.commit()
-def export2onesqlite(table, cats, *clsses):
+def export2onesqlite(table, cats, update='', *clsses):
cur = table.conn.cursor()
- print("Insert data")
- table.insert(zip(cats, *clsses), cursor=cur, many=True)
- cur.close()
+ if update:
+ print("Update table inserting classification data")
+ print(update)
+ clsses.append(cats)
+ table.execute(update, many=True, values=zip(*clsses))
+ else:
+ print("Insert data")
+ table.insert(zip(cats, *clsses), cursor=cur, many=True)
+ cur.close()
table.conn.commit()
@@ -59,7 +65,8 @@
def export_results(vect_name, results, cats, rlayer,
- training=None, cols=None, overwrite=False, pkl=None):
+ training=None, cols=None, overwrite=False, append=False,
+ pkl=None):
if pkl:
res = open(pkl, 'w')
pickle.dump(results, res)
@@ -68,21 +75,33 @@
# check if the link already exist
with Vector(vect_name, mode='r') as vct:
link = vct.dblinks.by_name(rlayer)
- mode = 'r' if link else 'rw'
+ mode = 'r' if link else 'w'
print("Opening vector <%s>" % vect_name)
with Vector(vect_name, mode=mode) as vect:
if cols:
cols.insert(0, COLS[0])
tab = link.table() if link else Table(rlayer, vect.table.conn)
- if tab.exist():
- print("Table <%s> already exist, will be removed." % tab.name)
- tab.drop(force=overwrite)
- print("Ceating a new table <%s>." % rlayer)
- import ipdb; ipdb.set_trace()
- tab.create(cols)
- export2onesqlite(tab, cats, *[cls['predict'] for cls in results])
- if mode == 'rw':
+ if tab.exist() and append:
+ columns_to_up = []
+ # add the column to the table
+ for cname, ctype in cols:
+ columns_to_up.append("%s=?" % cname)
+ if cname not in tab.columns:
+ tab.columns.add(cname, ctype)
+ upsql = "UPDATE %s SET %s WHERE %s=%s"
+ up = upsql % (tab.name, ','.join(columns_to_up), tab.key, '?')
+ else:
+ if tab.exist():
+ print("Table <%s> already exist, will be removed." % tab.name)
+ tab.drop(force=True)
+ print("Ceating a new table <%s>." % rlayer)
+ tab.create(cols)
+ up = ''
+
+ export2onesqlite(tab, cats.astype(int), up,
+ *[cls['predict'].astype(int) for cls in results])
+ if mode == 'w':
nlyr = len(vect.dblinks) + 1
link = Link(nlyr, tab.name, tab.name)
vect.dblinks.add(link)
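
The append path builds one parameterized UPDATE and runs it with executemany; the cat key goes last in each value tuple so it lines up with the trailing WHERE placeholder. The same pattern with plain sqlite3 and hypothetical table/column names (pygrass Table.execute(many=True, values=...) wraps the identical DB-API call):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    cur = conn.cursor()
    cur.execute("CREATE TABLE res (cat INTEGER PRIMARY KEY, svc INTEGER)")
    cur.executemany("INSERT INTO res VALUES (?, ?)", [(1, 0), (2, 0)])
    up = "UPDATE res SET svc=? WHERE cat=?"
    cur.executemany(up, zip([10, 20], [1, 2]))  # predictions first, cats last
    conn.commit()
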
Modified: grass-addons/grass7/vector/v.class.ml/sqlite2npy.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/sqlite2npy.py 2014-11-09 18:08:55 UTC (rev 62687)
+++ grass-addons/grass7/vector/v.class.ml/sqlite2npy.py 2014-11-09 18:19:12 UTC (rev 62688)
@@ -17,7 +17,7 @@
FTDATA = 'training_data.npy'
-def cpdata(shape, iterator, msg=''):
+def cpdata(shape, iterator, dtype=float, msg=''):
"""Avoid to create a python list and then convert the python list to a
numpy array. This function instantiate statically a numpy array and then
fill the numpy array with the data coming from the generator to reduce
@@ -26,7 +26,7 @@
#msgr = ???
#msgr.message(msg)
print(msg)
- dt = np.zeros(shape)
+ dt = np.empty(shape, dtype=dtype)
for i, data in enumerate(iterator):
#msgr.percent(i, nrows, 2)
dt[i] = data
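
With dtype exposed, cpdata can preallocate a typed buffer (np.empty skips the zero fill) and stream rows from a generator without an intermediate Python list. A condensed sketch of the pattern:

    import numpy as np

    def cpdata(shape, iterator, dtype=float):
        dt = np.empty(shape, dtype=dtype)
        for i, row in enumerate(iterator):
            dt[i] = row
        return dt

    cats = cpdata((4,), iter([1, 2, 3, 4]), dtype=int)  # array([1, 2, 3, 4])
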
@@ -80,10 +80,10 @@
dta = cpdata(shape, data.execute(slct_data), msg=slct_data)
# extract the cats
- slct_cats = "SELECT {cat} FROM {tname};".format(cat=data.key,
- tname=data.name)
+ slct_cats = "SELECT {cat} FROM {tname};".format(cat=trng.key,
+ tname=trng.name)
cats = cpdata((n_data, ), (c[0] for c in data.execute(slct_cats)),
- msg=slct_cats)
+ dtype=int, msg=slct_cats)
# cats = np.array([c[0] for c in data.execute(slct_cats)])
# training samples
Modified: grass-addons/grass7/vector/v.class.ml/training_extraction.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/training_extraction.py 2014-11-09 18:08:55 UTC (rev 62687)
+++ grass-addons/grass7/vector/v.class.ml/training_extraction.py 2014-11-09 18:19:12 UTC (rev 62688)
@@ -159,6 +159,7 @@
msgr = get_msgr()
tname, tmset = tvect.split('@') if '@' in tvect else (tvect, '')
vname, vmset = vect.split('@') if '@' in vect else (vect, '')
+
with VectorTopo(tname, tmset, mode='r') as trn:
with VectorTopo(vname, vmset, mode='r') as vct:
layer_num, layer_name = get_layer_num_name(vct, tlayer)
@@ -167,7 +168,7 @@
seg_area = Area(c_mapinfo=vct.c_mapinfo)
n_areas = trn.number_of('areas')
# check/remove/create a new table
- table, create_link = make_new_table(vct, layer_name)
+ table, create_link = make_new_table(vct, layer_name, force=True)
find_lines(table, [l for l in trn.viter('lines')], vct)
# find and save all the segments
find_area(table, trn.viter('areas', idonly=True),
Modified: grass-addons/grass7/vector/v.class.ml/v.class.ml.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/v.class.ml.py 2014-11-09 18:08:55 UTC (rev 62687)
+++ grass-addons/grass7/vector/v.class.ml/v.class.ml.py 2014-11-09 18:19:12 UTC (rev 62688)
@@ -209,7 +209,7 @@
#%end
#%option
#% key: posinf
-#% type: double
+#% type: string
#% multiple: yes
#% description: Key:Value or Numpy function used to substitute posinf values
#% required: no
@@ -256,6 +256,14 @@
#% answer: linear,poly,rbf,sigmoid
#%end
#%option
+#% key: svc_poly_range
+#% type: string
+#% multiple: yes
+#% description: Polynomial order list to explore SVC domain
+#% required: no
+#% answer:
+#%end
+#%option
#% key: svc_n_jobs
#% type: integer
#% multiple: no
@@ -346,6 +354,10 @@
#% key: d
#% description: Explore the SVC domain
#%end
+#%flag
+#% key: a
+#% description: Append the classification results
+#%end
#-----------------------------------------------------
from __future__ import (absolute_import, division, print_function,
unicode_literals)
@@ -420,7 +432,8 @@
def get_colors(vtraining):
- with Vector(vtraining, mode='r') as vct:
+ vect, mset = vtraining.split('@') if '@' in vtraining else (vtraining, '')
+ with Vector(vect, mapset=mset, mode='r') as vct:
cur = vct.table.execute('SELECT cat, color FROM %s;' % vct.name)
return dict([c for c in cur.fetchall()])
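
The fix lets get_colors accept fully qualified map names; a quick check of the split (hypothetical name):

    vtraining = 'training@user1'
    vect, mset = vtraining.split('@') if '@' in vtraining else (vtraining, '')
    assert (vect, mset) == ('training', 'user1')
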
@@ -457,12 +470,13 @@
if report:
indent = ' '
tot = len(array)
- for key in special:
+ for k in special:
fmt = '- %15s (%3d/%d, %4.3f%%)'
- strs = [fmt % (col, cnt, tot, cnt/float(tot)*100)
- for col, cnt in zip(cols[np.array(sp[key])], cntr[key])]
- print('%s:\n%s' % (key, indent), ('\n%s' % indent).join(strs),
- sep='')
+ if sp[k]:
+ strs = [fmt % (col, cnt, tot, cnt/float(tot)*100)
+ for col, cnt in zip(cols[np.array(sp[k])], cntr[k])]
+ print('%s:\n%s' % (k, indent), ('\n%s' % indent).join(strs),
+ sep='')
return sp
@@ -495,12 +509,12 @@
indexes = None
vect = opt['vector']
vtraining = opt['vtraining'] if opt['vtraining'] else None
- scaler = None
+ scaler, decmp = None, None
vlayer = opt['vlayer'] if opt['vlayer'] else vect + '_stats'
tlayer = opt['tlayer'] if opt['tlayer'] else vect + '_training'
rlayer = opt['rlayer'] if opt['rlayer'] else vect + '_results'
- labels = extract_classes(vtraining, vlayer)
+ labels = extract_classes(vtraining, 1)
pprint(labels)
if opt['scalar']:
@@ -509,15 +523,16 @@
with_std='with_std' in scapar)
if opt['decomposition']:
- decmp, params = (opt['decomposition'].split('|')
- if '|' in opt['decomposition']
- else (opt['decomposition'], ''))
+ dec, params = (opt['decomposition'].split('|')
+ if '|' in opt['decomposition']
+ else (opt['decomposition'], ''))
kwargs = ({k: v for k, v in (p.split('=') for p in params.split(','))}
if params else {})
- dec = DECMP[decmp](**kwargs)
+ decmp = DECMP[dec](**kwargs)
+
# if training extract training
if vtraining and flg['e']:
- msgr.message("Extract training from: <%s>." % vtraining)
+ msgr.message("Extract training from: <%s> to <%s>." % (vtraining, vect))
extract_training(vect, vtraining, tlayer)
flg['n'] = True
@@ -565,9 +580,23 @@
# Substitute (skip cat column)
Xt, rules_vals = substitute(Xt, rules, cols[1:])
+ Xtoriginal = Xt
+ # scale the data
+ if scaler:
+ msgr.message("Scaling the training data set.")
+ scaler.fit(Xt, Yt)
+ Xt = scaler.transform(Xt)
+
+ # decompose data
+ if decmp:
+ msgr.message("Decomposing the training data set.")
+ decmp.fit(Xt)
+ Xt = decmp.transform(Xt)
+
# Feature importances with forests of trees
if flg['f']:
+ np.save('training_transformed.npy', Xt)
importances(Xt, Yt, cols[1:],
csv=opt['imp_csv'], img=opt['imp_fig'],
# default parameters to save the matplotlib figure
@@ -581,7 +610,7 @@
msgr.message("Find the optimum training set.")
best, Xbt, Ybt = optimize_training(cls, Xt, Yt,
labels, #{v: k for k, v in labels.items()},
- scaler,
+ scaler, decmp,
num=num, maxiterations=1000)
msg = " - save the optimum training data set to: %s."
msgr.message(msg % opt['npy_btdata'])
@@ -610,17 +639,27 @@
Xbt = scaler.transform(Xbt)
if flg['d']:
- C_range = [float(c) for c in opt['svc_c_range'].split(',')]
- gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',')]
- kernel_range = [str(s) for s in opt['svc_kernel_range'].split(',')]
- poly_range = [int(i) for i in opt['poly_range'].split(',')]
+ C_range = [float(c) for c in opt['svc_c_range'].split(',') if c]
+ gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',') if g]
+ kernel_range = [str(s) for s in opt['svc_kernel_range'].split(',') if s]
+ poly_range = [int(i) for i in opt['svc_poly_range'].split(',') if i]
+ allkwargs = dict(C=C_range, gamma=gamma_range,
+ kernel=kernel_range, degree=poly_range)
+ kwargs = {}
+ for k in allkwargs:
+ if allkwargs[k]:
+ kwargs[k] = allkwargs[k]
msgr.message("Exploring the SVC domain.")
- grid = explore_SVC(Xbt, Ybt, n_folds=3, n_jobs=int(opt['svc_n_jobs']),
- C=C_range, gamma=gamma_range, kernel=kernel_range)
+ grid = explore_SVC(Xbt, Ybt, n_folds=5, n_jobs=int(opt['svc_n_jobs']),
+ **kwargs)
import pickle
- pkl = open('grid.pkl', 'w')
+ krnlstr = '_'.join(s for s in opt['svc_kernel_range'].split(',') if s)
+ pkl = open('grid%s.pkl' % krnlstr, 'w')
pickle.dump(grid, pkl)
pkl.close()
+# pkl = open('grid.pkl', 'r')
+# grid = pickle.load(pkl)
+# pkl.close()
plot_grid(grid, save=opt['svc_img'])
# test the accuracy of different classifiers
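
The filtering above forwards only the non-empty ranges to explore_SVC, so e.g. a purely linear exploration no longer forces a gamma grid. A standalone sketch with hypothetical option values:

    opt = {'svc_c_range': '1,10,100', 'svc_gamma_range': '',
           'svc_kernel_range': 'linear', 'svc_poly_range': ''}
    C_range = [float(c) for c in opt['svc_c_range'].split(',') if c]
    gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',') if g]
    kernel_range = [s for s in opt['svc_kernel_range'].split(',') if s]
    poly_range = [int(i) for i in opt['svc_poly_range'].split(',') if i]
    allkwargs = dict(C=C_range, gamma=gamma_range,
                     kernel=kernel_range, degree=poly_range)
    kwargs = {k: v for k, v in allkwargs.items() if v}
    # kwargs == {'C': [1.0, 10.0, 100.0], 'kernel': ['linear']}
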
@@ -628,7 +667,7 @@
# test different classifiers
msgr.message("Exploring different classifiers.")
msgr.message("cls_id cls_name mean max min std")
- #import ipdb; ipdb.set_trace()
+
res = explorer_clsfiers(classifiers, Xt, Yt, labels=labels,
indexes=indexes, n_folds=5,
bv=flg['v'], extra=flg['x'])
@@ -639,27 +678,43 @@
if flg['c']:
# classify
- cols = []
data = np.load(opt['npy_data'])
- pprint(rules_vals)
- # Substitute (skip cat column)
- data = substitute(data, rules_vals, cols[1:])
+ indx = np.load(opt['npy_index'])
- msgr.message("Scaling the whole data set.")
- data = scaler.transform(data) if scaler else data
+ # Substitute using column values
+ data, dummy = substitute(data, rules, cols[1:])
+ Xt = data[indx]
+
+ if scaler:
+ msgr.message("Scaling the training data set.")
+ scaler.fit(Xt, Yt)
+ Xt = scaler.transform(Xt)
+ msgr.message("Scaling the whole data set.")
+ data = scaler.transform(data)
+ if decmp:
+ msgr.message("Decomposing the training data set.")
+ decmp.fit(Xt)
+ Xt = decmp.transform(Xt)
+ msgr.message("Decompose the whole data set.")
+ data = decmp.transform(data)
cats = np.load(opt['npy_cats'])
+ np.save('data_filled_scaled.npy', data)
+ tcols = []
for cls in classifiers:
- run_classifier(cls, Xbt, Ybt, Xt, Yt, labels, data,
- save=opt['report_class'])
- cols.append((cls['name'], 'INTEGER'))
+ report = (open(opt['report_class'], "w")
+ if opt['report_class'] else sys.stdout)
+ run_classifier(cls, Xt, Yt, Xt, Yt, labels, data,
+ report=report)
+ tcols.append((cls['name'], 'INTEGER'))
-# import pickle
-# res = open('res.pkl', 'r')
-# classifiers = pickle.load(res)
+ import pickle
+ with open('classification_results.pkl', 'w') as res:
+ pickle.dump(classifiers, res)
+ #classifiers = pickle.load(res)
msgr.message("Export the results to layer: <%s>" % str(rlayer))
- export_results(vect, classifiers, cats, rlayer, vtraining, cols,
- overwrite(), pkl='res.pkl')
+ export_results(vect, classifiers, cats, rlayer, vtraining, tcols,
+ overwrite(), pkl='res.pkl', append=flg['a'])
# res.close()
if flg['r']:
@@ -673,15 +728,15 @@
rasters = [c for c in tab.columns]
rasters.remove(tab.key)
- import ipdb; ipdb.set_trace()
v2rst = Module('v.to.rast')
rclrs = Module('r.colors')
for rst in rasters:
v2rst(input=vect, layer=rlayer, type='area',
- use='attr', attrcolumn=rst, output=opt['rst_names'] % rst,
- rows=4096 * 4, overwrite=overwrite())
+ use='attr', attrcolumn=rst.encode(),
+ output=(opt['rst_names'] % rst).encode(),
+ memory=1000, overwrite=overwrite())
if rules:
- rclrs(map=rst, rules='-', stdin_=rules)
+ rclrs(map=rst.encode(), rules='-', stdin_=rules)
if __name__ == "__main__":