[GRASS-SVN] r60476 - grass-addons/grass7/vector/v.class.ml
svn_grass at osgeo.org
Sun May 25 12:07:24 PDT 2014
Author: zarch
Date: 2014-05-25 12:07:24 -0700 (Sun, 25 May 2014)
New Revision: 60476
Added:
grass-addons/grass7/vector/v.class.ml/features.py
Modified:
grass-addons/grass7/vector/v.class.ml/Makefile
grass-addons/grass7/vector/v.class.ml/ml_classifiers.py
grass-addons/grass7/vector/v.class.ml/ml_functions.py
grass-addons/grass7/vector/v.class.ml/sqlite2npy.py
grass-addons/grass7/vector/v.class.ml/training_extraction.py
grass-addons/grass7/vector/v.class.ml/v.class.ml.py
Log:
Add new classifiers and new options
Modified: grass-addons/grass7/vector/v.class.ml/Makefile
===================================================================
--- grass-addons/grass7/vector/v.class.ml/Makefile 2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/Makefile 2014-05-25 19:07:24 UTC (rev 60476)
@@ -2,7 +2,8 @@
PGM = v.class.ml
-ETCFILES = training_extraction ml_classifiers ml_functions sqlite2npy npy2table
+ETCFILES = training_extraction ml_classifiers ml_functions \
+ sqlite2npy npy2table features
include $(MODULE_TOPDIR)/include/Make/Script.make
include $(MODULE_TOPDIR)/include/Make/Python.make
Added: grass-addons/grass7/vector/v.class.ml/features.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/features.py (rev 0)
+++ grass-addons/grass7/vector/v.class.ml/features.py 2014-05-25 19:07:24 UTC (rev 60476)
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import (absolute_import, division, print_function)
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.ensemble import ExtraTreesClassifier
+
+
+N_ESTIMATORS = 500
+RANDOM_STATE = 0
+
+
+IMP_DTYPE = [('col', '<U24'), ('imp', 'f'), ('std', 'f')]
+
+
+def tocsv(array, sep=';', fmt='%s'):
+ return '\n'.join([sep.join([fmt % el for el in row]) for row in array])
+
+
+def importances(X, y, cols, csv='', img='',
+ clf=ExtraTreesClassifier(n_estimators=N_ESTIMATORS,
+ random_state=RANDOM_STATE),
+ **savefig):
+ clf.fit(X, y)
+ imp = clf.feature_importances_
+ std = np.std([est.feature_importances_ for est in clf.estimators_], axis=0)
+ res = np.array([(c, i, s) for c, i, s in zip(cols, imp, std)],
+ dtype=IMP_DTYPE)
+ res.sort(order='imp')
+ res = res[::-1]
+ if csv:
+ with open(csv, 'w') as csv:
+ csv.write(tocsv(res))
+ if img:
+ fig, ax = plt.subplots(figsize=(5, 40))
+ pos = range(len(cols))
+ ax.barh(pos, res['imp'] * 100, align='center', alpha=0.4)
+ ax.set_yticks(pos)
+ ax.set_yticklabels(res['col'])
+ ax.set_xlabel('Importance [%]')
+ ax.set_title("Feature importances")
+ ax.grid()
+ fig.savefig(img, **savefig)
+ return res
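
A minimal usage sketch of the new importances() helper (editor's illustration with synthetic data; the file names and the savefig keyword values below are assumptions, not part of the commit):

    import numpy as np
    from features import importances

    X = np.random.random((100, 3))          # 100 samples, 3 features
    y = np.random.randint(0, 2, size=100)   # two classes
    cols = np.array(['area', 'perimeter', 'compactness'])
    # returns a structured array sorted by decreasing importance and,
    # because csv/img are given, writes both output files as a side effect
    res = importances(X, y, cols, csv='imp.csv', img='imp.png',
                      dpi=150, bbox_inches='tight')
    print(res['col'][0], res['imp'][0])     # the strongest feature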
Modified: grass-addons/grass7/vector/v.class.ml/ml_classifiers.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_classifiers.py 2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/ml_classifiers.py 2014-05-25 19:07:24 UTC (rev 60476)
@@ -9,13 +9,18 @@
from gettext import lgettext as _
from sklearn.linear_model import SGDClassifier
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import (AdaBoostClassifier,
+ ExtraTreesClassifier,
+ GradientBoostingClassifier,
+ RandomForestClassifier,
+ RandomTreesEmbedding)
from sklearn.neighbors import (NearestNeighbors,
KNeighborsClassifier,
RadiusNeighborsClassifier,
NearestCentroid)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
+from sklearn import metrics
from grass.pygrass.messages import Messenger
@@ -85,44 +90,7 @@
'kwargs': {'n_neighbors': 8, 'weights': 'distance'}},
{'name': 'knn16_distance', 'classifier': KNeighborsClassifier,
'kwargs': {'n_neighbors': 16, 'weights': 'distance'}},
- # radius
- {'name': 'knn_radius_0p5_uniform',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 0.5, 'weights': 'uniform'}},
- {'name': 'knn_radius_1_uniform',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 1., 'weights': 'uniform'}},
- {'name': 'knn_radius_1p5_uniform',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 1.5, 'weights': 'uniform'}},
- {'name': 'knn_radius_2_uniform',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 2., 'weights': 'uniform'}},
- {'name': 'knn_radius_2p5_uniform',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 2.5, 'weights': 'uniform'}},
- {'name': 'knn_radius_5_uniform',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 5., 'weights': 'uniform'}},
- {'name': 'knn_radius_0p5_distance',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 0.5, 'weights': 'distance'}},
- {'name': 'knn_radius_1_distance',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 1., 'weights': 'distance'}},
- {'name': 'knn_radius_1p5_distance',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 1.5, 'weights': 'distance'}},
- {'name': 'knn_radius_2_distance',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 2., 'weights': 'distance'}},
- {'name': 'knn_radius_2p5_distance',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 2.5, 'weights': 'distance'}},
- {'name': 'knn_radius_5_distance',
- 'classifier': RadiusNeighborsClassifier,
- 'kwargs': {'radius': 5., 'weights': 'distance'}},
# centroid
# ‘euclidean’, ‘l2’, ‘l1’, ‘manhattan’, ‘cityblock’
# [‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘cosine’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]
@@ -161,25 +129,25 @@
{'name': 'knn_centroid_manhattan_none', 'classifier': NearestCentroid,
'kwargs': {'metric': 'manhattan', 'shrink_threshold ': None}},
- {'name': 'knn_centroid_manhattan_0p5', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 0.5}},
- {'name': 'knn_centroid_manhattan_1', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.0}},
- {'name': 'knn_centroid_manhattan_1p5', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.5}},
- {'name': 'knn_centroid_manhattan_2', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 2.0}},
+# {'name': 'knn_centroid_manhattan_0p5', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 0.5}},
+# {'name': 'knn_centroid_manhattan_1', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.0}},
+# {'name': 'knn_centroid_manhattan_1p5', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.5}},
+# {'name': 'knn_centroid_manhattan_2', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 2.0}},
{'name': 'knn_centroid_cityblock_none', 'classifier': NearestCentroid,
'kwargs': {'metric': 'cityblock', 'shrink_threshold ': None}},
- {'name': 'knn_centroid_cityblock_0p5', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 0.5}},
- {'name': 'knn_centroid_cityblock_1', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.0}},
- {'name': 'knn_centroid_cityblock_1p5', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.5}},
- {'name': 'knn_centroid_cityblock_2', 'classifier': NearestCentroid,
- 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 2.0}},
+# {'name': 'knn_centroid_cityblock_0p5', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 0.5}},
+# {'name': 'knn_centroid_cityblock_1', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.0}},
+# {'name': 'knn_centroid_cityblock_1p5', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.5}},
+# {'name': 'knn_centroid_cityblock_2', 'classifier': NearestCentroid,
+# 'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 2.0}},
#
# Tree
#
@@ -192,12 +160,12 @@
'kwargs': {'criterion': 'gini', 'max_depth': 'sqrt'}},
{'name': 'd_tree_gini_log2', 'classifier': DecisionTreeClassifier,
'kwargs': {'criterion': 'gini', 'max_depth': 'log2'}},
- {'name': 'd_tree_gini_0p25', 'classifier': DecisionTreeClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 0.25}},
- {'name': 'd_tree_gini_0p50', 'classifier': DecisionTreeClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 0.5}},
- {'name': 'd_tree_gini_0p75', 'classifier': DecisionTreeClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 0.75}},
+# {'name': 'd_tree_gini_0p25', 'classifier': DecisionTreeClassifier,
+# 'kwargs': {'criterion': 'gini', 'max_depth': 0.25}},
+# {'name': 'd_tree_gini_0p50', 'classifier': DecisionTreeClassifier,
+# 'kwargs': {'criterion': 'gini', 'max_depth': 0.5}},
+# {'name': 'd_tree_gini_0p75', 'classifier': DecisionTreeClassifier,
+# 'kwargs': {'criterion': 'gini', 'max_depth': 0.75}},
{'name': 'd_tree_entropy', 'classifier': DecisionTreeClassifier,
'kwargs': {'criterion': 'entropy', 'splitter': 'best', 'max_depth': None,
@@ -208,46 +176,156 @@
'kwargs': {'criterion': 'entropy', 'max_depth': 'sqrt'}},
{'name': 'd_tree_entropy_log2', 'classifier': DecisionTreeClassifier,
'kwargs': {'criterion': 'entropy', 'max_depth': 'log2'}},
- {'name': 'd_tree_entropy_0p25', 'classifier': DecisionTreeClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 0.25}},
- {'name': 'd_tree_entropy_0p50', 'classifier': DecisionTreeClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 0.5}},
- {'name': 'd_tree_entropy_0p75', 'classifier': DecisionTreeClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 0.75}},
+# {'name': 'd_tree_entropy_0p25', 'classifier': DecisionTreeClassifier,
+# 'kwargs': {'criterion': 'entropy', 'max_depth': 0.25}},
+# {'name': 'd_tree_entropy_0p50', 'classifier': DecisionTreeClassifier,
+# 'kwargs': {'criterion': 'entropy', 'max_depth': 0.5}},
+# {'name': 'd_tree_entropy_0p75', 'classifier': DecisionTreeClassifier,
+# 'kwargs': {'criterion': 'entropy', 'max_depth': 0.75}},
+ #
+ # Forest
+ #
{'name': 'rand_tree_gini', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'gini', 'splitter': 'best', 'max_depth': None,
+ 'kwargs': {'criterion': 'gini', 'max_depth': None,
'min_samples_split': 2, 'min_samples_leaf': 1,
'max_features': None, 'random_state': None,
'min_density': None}},
{'name': 'rand_tree_gini_sqrt', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 'sqrt'}},
+ 'kwargs': {'criterion': 'gini', 'max_depth': None,
+ 'min_samples_split': 2, 'min_samples_leaf': 1,
+ 'max_features': 'sqrt', 'random_state': None,
+ 'min_density': None}},
{'name': 'rand_tree_gini_log2', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 'log2'}},
+ 'kwargs': {'criterion': 'gini', 'max_depth': None,
+ 'min_samples_split': 2, 'min_samples_leaf': 1,
+ 'max_features': 'log2', 'random_state': None,
+ 'min_density': None}},
+ {'name': 'rand_tree_gini_0p05', 'classifier': RandomForestClassifier,
+ 'kwargs': {'criterion': 'gini', 'max_features': 0.05}},
{'name': 'rand_tree_gini_0p25', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 0.25}},
+ 'kwargs': {'criterion': 'gini', 'max_features': 0.25}},
{'name': 'rand_tree_gini_0p50', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 0.5}},
+ 'kwargs': {'criterion': 'gini', 'max_features': 0.5}},
{'name': 'rand_tree_gini_0p75', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'gini', 'max_depth': 0.75}},
+ 'kwargs': {'criterion': 'gini', 'max_features': 0.75}},
{'name': 'rand_tree_entropy', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'entropy', 'splitter': 'best', 'max_depth': None,
+ 'kwargs': {'criterion': 'entropy', 'max_depth': None,
'min_samples_split': 2, 'min_samples_leaf': 1,
'max_features': None, 'random_state': None,
'min_density': None}},
{'name': 'rand_tree_entropy_sqrt', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 'sqrt'}},
+ 'kwargs': {'criterion': 'entropy', 'max_depth': None,
+ 'min_samples_split': 2, 'min_samples_leaf': 1,
+ 'max_features': 'sqrt', 'random_state': None,
+ 'min_density': None}},
{'name': 'rand_tree_entropy_log2', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 'log2'}},
+ 'kwargs': {'criterion': 'entropy', 'max_depth': None,
+ 'min_samples_split': 2, 'min_samples_leaf': 1,
+ 'max_features': 'log2', 'random_state': None,
+ 'min_density': None}},
{'name': 'rand_tree_entropy_0p25', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 0.25}},
+ 'kwargs': {'criterion': 'entropy', 'max_features': 0.25}},
{'name': 'rand_tree_entropy_0p50', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 0.5}},
+ 'kwargs': {'criterion': 'entropy', 'max_features': 0.5}},
{'name': 'rand_tree_entropy_0p75', 'classifier': RandomForestClassifier,
- 'kwargs': {'criterion': 'entropy', 'max_depth': 0.75}},
+ 'kwargs': {'criterion': 'entropy', 'max_features': 0.75}},
+# # RandomTreesEmbedding
+# {'name': 'rand_tree_emb_10_5', 'classifier': RandomTreesEmbedding,
+# 'kwargs': dict(n_estimators=10, max_depth=5, min_samples_split=2,
+# min_samples_leaf=1, n_jobs=-1, random_state=None, verbose=0,
+# min_density=None)},
+# {'name': 'rand_tree_emb_10_5_leaf3', 'classifier': RandomTreesEmbedding,
+# 'kwargs': dict(n_estimators=10, max_depth=5, min_samples_split=2,
+# min_samples_leaf=3, n_jobs=1, random_state=None, verbose=0,
+# min_density=None)},
+# {'name': 'rand_tree_emb_10_50', 'classifier': RandomTreesEmbedding,
+# 'kwargs': dict(n_estimators=10, max_depth=50, min_samples_split=2,
+# min_samples_leaf=1, n_jobs=1, random_state=None, verbose=0,
+# min_density=None)},
+# {'name': 'rand_tree_emb_100_50', 'classifier': RandomTreesEmbedding,
+# 'kwargs': dict(n_estimators=100, max_depth=50, min_samples_split=2,
+# min_samples_leaf=1, n_jobs=1, random_state=None, verbose=0,
+# min_density=None)},
+# {'name': 'rand_tree_emb_100_50', 'classifier': RandomTreesEmbedding,
+# 'kwargs': dict(n_estimators=100, max_depth=50, min_samples_split=2,
+# min_samples_leaf=3, n_jobs=1, random_state=None, verbose=0,
+# min_density=None)},
+
#
+ # AdaBoost classifier
+ #
+ {'name': 'ada_50_1.0', 'classifier': AdaBoostClassifier,
+ 'kwargs': dict(base_estimator=DecisionTreeClassifier(compute_importances=None, criterion='gini',
+ max_depth=1, max_features=None, min_density=None,
+ min_samples_leaf=1, min_samples_split=2, random_state=None,
+ splitter='best'),
+ n_estimators=50, learning_rate=1.0,
+ algorithm='SAMME.R', random_state=None)},
+ {'name': 'ada_50_1.0_minleaf3', 'classifier': AdaBoostClassifier,
+ 'kwargs': dict(base_estimator=DecisionTreeClassifier(compute_importances=None, criterion='gini',
+ max_depth=1, max_features=None, min_density=None,
+ min_samples_leaf=3, min_samples_split=2, random_state=None,
+ splitter='best'),
+ n_estimators=50, learning_rate=1.0,
+ algorithm='SAMME.R', random_state=None)},
+ {'name': 'ada_50_0.5', 'classifier': AdaBoostClassifier,
+ 'kwargs': dict(base_estimator=DecisionTreeClassifier(compute_importances=None, criterion='gini',
+ max_depth=1, max_features=None, min_density=None,
+ min_samples_leaf=1, min_samples_split=2, random_state=None,
+ splitter='best'),
+ n_estimators=50, learning_rate=0.5,
+ algorithm='SAMME.R', random_state=None)},
+
+ {'name': 'extra_tree_10_1', 'classifier': ExtraTreesClassifier,
+ 'kwargs': dict(n_estimators=10, criterion='gini', max_depth=None,
+ min_samples_split=2, min_samples_leaf=1,
+ max_features='auto', bootstrap=False, oob_score=False,
+ n_jobs=1, random_state=None, verbose=0, min_density=None,
+ compute_importances=None)},
+ {'name': 'extra_tree_10_3', 'classifier': ExtraTreesClassifier,
+ 'kwargs': dict(n_estimators=10, criterion='gini', max_depth=None,
+ min_samples_split=2, min_samples_leaf=3,
+ max_features='auto', bootstrap=False, oob_score=False,
+ n_jobs=1, random_state=None, verbose=0, min_density=None,
+ compute_importances=None)},
+ {'name': 'extra_tree_100_1', 'classifier': ExtraTreesClassifier,
+ 'kwargs': dict(n_estimators=100, criterion='gini', max_depth=None,
+ min_samples_split=2, min_samples_leaf=1,
+ max_features='auto', bootstrap=False, oob_score=False,
+ n_jobs=1, random_state=None, verbose=0, min_density=None,
+ compute_importances=None)},
+ {'name': 'extra_tree_100_3', 'classifier': ExtraTreesClassifier,
+ 'kwargs': dict(n_estimators=100, criterion='gini', max_depth=None,
+ min_samples_split=2, min_samples_leaf=3,
+ max_features='auto', bootstrap=False, oob_score=False,
+ n_jobs=1, random_state=None, verbose=0, min_density=None,
+ compute_importances=None)},
+ {'name': 'extra_tree_100_5', 'classifier': ExtraTreesClassifier,
+ 'kwargs': dict(n_estimators=100, criterion='gini', max_depth=None,
+ min_samples_split=2, min_samples_leaf=5,
+ max_features='auto', bootstrap=False, oob_score=False,
+ n_jobs=1, random_state=None, verbose=0, min_density=None,
+ compute_importances=None)},
+ {'name': 'gradient_boost_100_minleaf1', 'classifier': GradientBoostingClassifier,
+ 'kwargs': dict(loss='deviance', learning_rate=0.1, n_estimators=100,
+ subsample=1.0, min_samples_split=2, min_samples_leaf=1,
+ max_depth=3, init=None, random_state=None,
+ max_features=None, verbose=0)},
+ {'name': 'gradient_boost_100_minleaf3', 'classifier': GradientBoostingClassifier,
+ 'kwargs': dict(loss='deviance', learning_rate=0.1, n_estimators=100,
+ subsample=1.0, min_samples_split=2, min_samples_leaf=3,
+ max_depth=3, init=None, random_state=None,
+ max_features=None, verbose=0)},
+ {'name': 'gradient_boost_100_minleaf5', 'classifier': GradientBoostingClassifier,
+ 'kwargs': dict(loss='deviance', learning_rate=0.1, n_estimators=100,
+ subsample=1.0, min_samples_split=2, min_samples_leaf=5,
+ max_depth=3, init=None, random_state=None,
+ max_features=None, verbose=0)},
+
+ #
# Gaussian
#
{'name': 'gaussianNB', 'classifier': GaussianNB},
@@ -390,4 +468,4 @@
# 'classifier': MLPYWrapper(mlpy.MaximumLikelihoodC)},
]
# add MLPY
- CLASSIFIERS.extend(MLPY_CLS)
+ #CLASSIFIERS.extend(MLPY_CLS)
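
Each CLASSIFIERS entry is a plain dict that the rest of the module set instantiates on demand; a sketch of that consumption pattern with toy data (the deprecated min_density/compute_importances keywords are deliberately omitted here):

    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier

    cls = {'name': 'extra_tree_10_1',
           'classifier': ExtraTreesClassifier,
           'kwargs': dict(n_estimators=10, min_samples_leaf=1)}

    X = np.random.random((30, 4))
    y = np.random.randint(0, 2, size=30)
    clf = cls['classifier'](**cls.get('kwargs', {}))  # instantiate with stored kwargs
    clf.fit(X, y)
    print(cls['name'], clf.score(X, y))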
Modified: grass-addons/grass7/vector/v.class.ml/ml_functions.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_functions.py 2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/ml_functions.py 2014-05-25 19:07:24 UTC (rev 60476)
@@ -4,32 +4,40 @@
@author: pietro
"""
-from __future__ import (absolute_import, division, print_function,
- unicode_literals)
+from __future__ import (absolute_import, division, print_function)
import time
import random as rnd
from gettext import lgettext as _
import sys
+import pickle as pk
import numpy as np
-import pylab as pl
+import matplotlib.pyplot as plt
-from sklearn.metrics import accuracy_score
+from sklearn import metrics as metrics
+from sklearn.metrics import precision_recall_curve as prc, roc_curve, auc
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
+from sklearn.cross_validation import cross_val_score
-from grass.pygrass.messages import Messenger
+#from grass.pygrass.messages import get_msgr
-MSGR = Messenger()
-
COLS = [('cat', 'INTEGER PRIMARY KEY'),
('class', 'INTEGER'),
('color', 'VARCHAR(11)'), ]
+SCORES_DTYPE = [('index', 'i'),
+ ('name', '<U32'),
+ ('mean', 'f'),
+ ('max', 'f'),
+ ('min', 'f'),
+ ('std', 'f')]
+
+
def print_cols(clss, sep=';', save=sys.stdout):
clsses = sorted(set(clss))
cols = ['ml_index', 'ml_name', 'fit_time', 'prediction_time',
@@ -50,26 +58,29 @@
print(sep.join(res), file=save)
-def accuracy(sol, cls=None, data=None, clss=None, pred=None):
+def accuracy(sol, cls=None, data=None, labels=None, pred=None):
cls = cls if cls else dict()
- clsses = clss if clss else sorted(set(sol))
+ clsses = sorted(labels.keys())
if 'cls' in cls:
cls['pred_start'] = time.time()
pred = cls['cls'].predict(data)
cls['pred_stop'] = time.time()
- cls['t_acc'] = accuracy_score(sol, pred, normalize=True)
+ cls['t_acc'] = metrics.accuracy_score(sol, pred, normalize=True)
+ lab = [labels[key] for key in clsses]
+ cls['report'] = metrics.classification_report(sol, pred, lab)
+ cls['confusion'] = metrics.confusion_matrix(sol, pred, lab)
c_acc = []
for c in clsses:
- indx = sol == c
- c_acc.append(accuracy_score(sol[indx], pred[indx],
- normalize=True))
+ indx = (sol == c).nonzero()
+ c_acc.append(metrics.accuracy_score(sol[indx], pred[indx],
+ normalize=True))
cls['c_acc'] = np.array(c_acc)
cls['c_acc_mean'] = cls['c_acc'].mean()
return cls
-def test_classifier(cls, Xt, Yt, Xd, Yd, clss, save=sys.stdout,
+def test_classifier(cls, Xt, Yt, Xd, Yd, labels, save=sys.stdout,
verbose=True):
cls['cls'] = cls['classifier'](**cls.get('kwargs', {}))
cls['fit_start'] = time.time()
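
The per-class accuracy computed in accuracy() above can be shown in isolation (toy arrays; sol holds the true classes, pred the predictions):

    import numpy as np
    from sklearn import metrics

    sol = np.array([0, 0, 1, 1, 1])
    pred = np.array([0, 1, 1, 1, 0])
    # accuracy restricted to each class, then averaged: a view that is
    # not dominated by the most frequent class
    c_acc = [metrics.accuracy_score(sol[sol == c], pred[sol == c])
             for c in sorted(set(sol))]
    print(c_acc, np.mean(c_acc))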
@@ -79,18 +90,20 @@
cls['params'] = cls['cls'].get_params()
except AttributeError:
cls['params'] = None
- accuracy(Yd, cls, Xd, clss)
+ accuracy(Yd, cls, Xd, labels)
if verbose:
print_test(cls, save=save)
-def run_classifier(cls, Xt, Yt, Xd, Yd, clss, data,
- save=sys.stdout):
- test_classifier(cls, Xt, Yt, Xd, Yd, clss, verbose=False)
+def run_classifier(cls, Xt, Yt, Xd, Yd, labels, data,
+ report=sys.stdout):
+ test_classifier(cls, Xt, Yt, Xd, Yd, labels, verbose=False)
cls['pred_start'] = time.time()
cls['predict'] = cls['cls'].predict(data)
cls['pred_stop'] = time.time()
- print_test(cls, save=save)
+ print_test(cls, save=report)
+ report.write('\n' + cls['report'])
+ report.write('\n%s' % cls['confusion'])
np.save(cls['name'] + '.npy', cls['predict'])
@@ -119,13 +132,14 @@
return bdata, bclss
-def optimize_training(cls, tdata, tclss,
+def optimize_training(cls, tdata, tclss, labels,
scaler=None, num=None, maxiterations=1000):
best = cls.copy()
best['c_acc_mean'] = 0
means = []
+ #msgr = get_msgr()
for i in range(maxiterations): # TODO: use multicore
- MSGR.percent(i, maxiterations, 1)
+ #msgr.percent(i, maxiterations, 1)
Xt, Yt = balance(tdata, tclss, num)
if scaler:
scaler.fit(Xt, Yt)
@@ -133,7 +147,7 @@
stdata = scaler.transform(tdata)
else:
sXt, stdata = Xt, tdata
- test_classifier(cls, sXt, Yt, stdata, tclss, None, verbose=False)
+ test_classifier(cls, sXt, Yt, stdata, tclss, labels, verbose=False)
if cls['c_acc_mean'] > best['c_acc_mean']:
print("%f > %f" % (cls['c_acc_mean'], best['c_acc_mean']))
best = cls.copy()
@@ -149,7 +163,132 @@
return best, bXt, bYt
-def explorer_clsfiers(clsses, Xt, Yt, Xd, Yd, clss,
+def plot_bias_variance(data_sizes, train_errors, test_errors, name,
+ title="Bias-Variance for '%s'",
+ train_err_std=None, test_err_std=None,
+ train_stl='-', test_stl='-',
+ train_width=1, test_width=1,
+ train_clr='b', test_clr='r', alpha=0.2,
+ fmt='png', **kwargs):
+ fig, ax = plt.subplots(figsize=(6, 5))
+ ax.set_ylim([0.0, 1.0])
+ ax.set_xlabel('Data set size')
+ ax.set_ylabel('Error')
+ ax.set_title(title % name)
+ if train_err_std is not None:
+ ax.fill_between(data_sizes,
+ train_errors - train_err_std,
+ train_errors + train_err_std,
+ facecolor=train_clr, alpha=alpha)
+ if test_err_std is not None:
+ ax.fill_between(data_sizes,
+ test_errors - test_err_std,
+ test_errors + test_err_std,
+ facecolor=test_clr, alpha=alpha)
+ ax.plot(data_sizes, test_errors, label="test error", color=test_clr,
+ linestyle=test_stl, linewidth=test_width)
+ ax.plot(data_sizes, train_errors, label="train error", color=train_clr,
+ linestyle=train_stl, linewidth=train_width)
+ ax.legend(loc="upper right")
+ ax.grid(True, linestyle='-', color='0.75')
+ fig.savefig("bv_%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+
+
+def bias_variance_analysis(cls, tdata, tclss, n_folds=5, step=5):
+ clss = sorted(set(tclss))
+ num = min([len(tclss[tclss == c]) for c in clss])
+
+ clf = cls['classifier'](**cls['kwargs'])
+ keys = ('fprs', 'tprs', 'roc_scores', 'pr_scores', 'precisions',
+ 'recalls', 'thresholds')
+
+ bv = {}
+ lk = {l: {k: [] for k in keys} for l in clss}
+ for n in range(5, num, step):
+ X, y = balance(tdata, tclss, n)
+ cv = StratifiedKFold(y, n_folds=n_folds)
+ # instantiate empty lists
+ train_errors, test_errors, scores = [], [], []
+ for train, test in cv:
+ X_train, y_train = X[train], y[train]
+ X_test, y_test = X[test], y[test]
+
+ # fit train data
+ clf.fit(X_train, y_train)
+
+ # get score
+ train_score = clf.score(X_train, y_train)
+ test_score = clf.score(X_test, y_test)
+ scores.append(test_score)
+
+ # get errors
+ train_errors.append(1 - train_score)
+ test_errors.append(1 - test_score)
+
+ # get probability
+ proba = clf.predict_proba(X_test)
+
+ # compute score for each class VS rest
+ for idx, label in enumerate(clss):
+ fpr, tpr, roc_thr = roc_curve(y_test, proba[:, idx], label)
+ precision, recall, pr_thr = prc(y_test, proba[:, idx], label)
+ lk[label]['fprs'].append(fpr)
+ lk[label]['tprs'].append(tpr)
+ lk[label]['roc_scores'].append(auc(fpr, tpr))
+
+ lk[label]['precisions'].append(precision)
+ lk[label]['recalls'].append(recall)
+ lk[label]['thresholds'].append(pr_thr)
+ lk[label]['pr_scores'].append(auc(recall, precision))
+ bv[n] = {'test': np.array(test_errors),
+ 'train': np.array(train_errors),
+ 'score': np.array(scores)}
+ cls['bias variance'] = bv
+ cls['label scores'] = lk
+
+
+def explorer_clsfiers(clsses, Xd, Yd, indexes=None, n_folds=5, bv=False):
+ gen = zip(indexes, clsses) if indexes else enumerate(clsses)
+ cv = StratifiedKFold(Yd, n_folds=n_folds)
+ fmt = '%5d %-30s %6.4f %6.4f %6.4f %6.4f'
+ res = []
+ kw = dict(bbox_inches="tight", dpi=300)
+ for index, cls in gen:
+ try:
+ cls['scores'] = cross_val_score(cls['classifier'](**cls['kwargs']),
+ Xd, Yd, cv=cv, n_jobs=1, verbose=0)
+ # TODO: if n_jobs == -1 raise:
+ # AttributeError: '_MainProcess' object has no attribute '_daemonic'
+ mean, mx, mn, st = (cls['scores'].mean(), cls['scores'].max(),
+ cls['scores'].min(), cls['scores'].std())
+ vals = (index, cls['name'], mean, mx, mn, st)
+ print(fmt % vals)
+ res.append(vals)
+ if bv:
+ bias_variance_analysis(cls, Xd, Yd, n_folds=5, step=5)
+ bv = cls['bias variance']
+ data_sizes = np.array(sorted(bv.keys()))
+ test = np.array([bv[i]['test'] for i in data_sizes])
+ train = np.array([bv[i]['train'] for i in data_sizes])
+ plot_bias_variance(data_sizes,
+ train.mean(axis=1), test.mean(axis=1),
+ cls['name'], "Bias-Variance for '%s'",
+ train_err_std=train.std(axis=1),
+ test_err_std=test.std(axis=1),
+ train_stl='-', test_stl='-',
+ train_width=1, test_width=1,
+ train_clr='b', test_clr='r', alpha=0.2,
+ fmt='png', **kw)
+ with open("%s.pkl" % cls['name'].replace(' ', '_'), 'wb') as pkl:
+ pk.dump(cls, pkl)
+ except:
+ #import ipdb; ipdb.set_trace()
+ #print('problem with: %s' % cls['name'])
+ pass
+ return np.array(res, dtype=SCORES_DTYPE)
+
+
+def explorer_clsfiers_old(clsses, Xt, Yt, Xd, Yd, clss,
indexes=None, csv=sys.stdout):
errors = []
gen = zip(indexes, clsses) if indexes else enumerate(clsses)
@@ -165,37 +304,46 @@
print('Error in: %s' % err['name'])
+CMAP = plt.cm.Blues
+
+
def plot_grid(grid, save=''):
C = grid.param_grid['C']
gamma = grid.param_grid['gamma']
-
- for kernel in grid.param_grid['kernel']:
+ kernels = grid.param_grid['kernel']
+ for kernel in kernels:
scores = [x[1] for x in grid.grid_scores_ if x[0]['kernel'] == kernel]
scores = np.array(scores).reshape(len(C), len(gamma))
# draw heatmap of accuracy as a function of gamma and C
- pl.figure(figsize=(8, 6))
- pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
- pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
- pl.xlabel(r'$\gamma$')
- pl.ylabel('C')
- pl.colorbar()
- pl.xticks(np.arange(len(gamma)), gamma, rotation=45)
- pl.yticks(np.arange(len(C)), C)
+ #pl.figure(figsize=(8, 6))
+ #pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
+ fig, ax = plt.subplots()
+ img = ax.imshow(scores, interpolation='nearest', cmap=CMAP)
+ ax.set_xlabel(r'$\gamma$')
+ ax.set_ylabel('C')
+ ax.set_xticks(np.arange(len(gamma)))
+ ax.set_xticklabels(gamma, rotation=45)
+ ax.set_yticks(np.arange(len(C)))
+ ax.set_yticklabels(C)
+# if kernel == 'poly':
+# import ipdb; ipdb.set_trace()
ic, igamma = np.unravel_index(np.argmax(scores), scores.shape)
- pl.plot(igamma, ic, 'r.')
- best = scores[igamma, ic]
- titl = r"$best:\, %0.4f, \,C:\, %g, \,\gamma: \,%g$" % (best,
- C[ic],
- gamma[igamma])
- pl.title(titl)
+ ax.plot(igamma, ic, 'r.')
+ best = scores[ic, igamma]
+ titl = r"%s $best:\, %0.4f, \,C:\, %g, \,\gamma: \,%g$" % (kernel.title(),
+ best, C[ic],
+ gamma[igamma])
+ ax.set_title(titl)
+ fig.colorbar(img)
if save:
- pl.savefig(save, dpi=600, trasparent=True, bbox_inches='tight')
- pl.show()
+ fig.savefig(save % kernel, dpi=600, transparent=True, bbox_inches='tight')
+ fig.show()
def explore_SVC(Xt, Yt, n_folds=3, n_jobs=1, **kwargs):
cv = StratifiedKFold(y=Yt, n_folds=n_folds)
- grid = GridSearchCV(SVC(), param_grid=kwargs, cv=cv, n_jobs=n_jobs)
+ grid = GridSearchCV(SVC(), param_grid=kwargs, cv=cv, n_jobs=n_jobs,
+ verbose=2)
grid.fit(Xt, Yt)
print("The best classifier is: ", grid.best_estimator_)
return grid
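
explore_SVC() above is a thin wrapper around GridSearchCV; a self-contained sketch of the same pattern on toy data (parameter values are illustrative; the pre-0.18 sklearn modules match this commit's imports):

    import numpy as np
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV

    X = np.random.random((60, 4))
    y = np.tile([0, 1, 2], 20)              # 20 samples per class
    cv = StratifiedKFold(y, n_folds=3)
    grid = GridSearchCV(SVC(), cv=cv, n_jobs=1,
                        param_grid=dict(C=[1, 10], gamma=[0.01, 0.1],
                                        kernel=['rbf', 'linear']))
    grid.fit(X, y)
    print("The best classifier is: ", grid.best_estimator_)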
Modified: grass-addons/grass7/vector/v.class.ml/sqlite2npy.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/sqlite2npy.py 2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/sqlite2npy.py 2014-05-25 19:07:24 UTC (rev 60476)
@@ -10,6 +10,7 @@
from grass.pygrass.vector import VectorTopo
FCATS = 'cats.npy'
+FCOLS = 'cols.npy'
FDATA = 'data.npy'
FINDX = 'indx.npy'
FCLSS = 'training_classes.npy'
@@ -32,22 +33,23 @@
return dt
-def save2npy(vect, l_data, l_trning,
- fcats=FCATS, fdata=FDATA, findx=FINDX,
+def save2npy(vect, l_data, l_trn,
+ fcats=FCATS, fcols=FCOLS, fdata=FDATA, findx=FINDX,
fclss=FCLSS, ftdata=FTDATA):
- """Return 5 arrays:
+ """Return 6 arrays:
- categories,
+ - columns name,
- data,
- a boolean array with the training,
- - the training classes
- - the training data
+ - the training classes,
+ - the training data.
"""
with VectorTopo(vect, mode='r') as vct:
# instantiate the tables
- data = (vct.dblinks.by_layer(l_data).table() if l_data.isdigit()
+ data = (vct.dblinks.by_layer(int(l_data)).table() if l_data.isdigit()
else vct.dblinks.by_name(l_data).table())
- trng = (vct.dblinks.by_layer(l_trning).table() if l_trning.isdigit()
- else vct.dblinks.by_name(l_trning).table())
+ trng = (vct.dblinks.by_layer(int(l_trn)).table() if l_trn.isdigit()
+ else vct.dblinks.by_name(l_trn).table())
# check the dimensions
n_trng, n_data = trng.n_rows(), data.n_rows()
@@ -68,9 +70,10 @@
# extract the data
data_cols = data.columns.names()
+ cols = np.array(data_cols)
data_cols.remove(data.key)
- cols = ', '.join(data_cols)
- slct_data = "SELECT {cols} FROM {tname};".format(cols=cols,
+ scols = ', '.join(data_cols)
+ slct_data = "SELECT {cols} FROM {tname};".format(cols=scols,
tname=data.name)
shape = (n_data, len(data_cols))
# use the function to be more memory efficient
@@ -89,11 +92,12 @@
# save
np.save(fcats, cats)
+ np.save(fcols, cols)
np.save(fdata, dta)
np.save(findx, trn_indxs)
np.save(fclss, trn_ind)
np.save(ftdata, trn_dta)
- return cats, dta, trn_indxs, trn_ind, trn_dta
+ return cats, cols, dta, trn_indxs, trn_ind, trn_dta
def load_from_npy(fcats=FCATS, fdata=FDATA, findx=FINDX,
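
The arrays written by save2npy() can be reloaded later without touching the vector map; a sketch using the default file names defined above (FTDATA's value is not shown in this diff, so 'training_data.npy' is an assumption):

    import numpy as np

    cats = np.load('cats.npy')     # category of each segment
    cols = np.load('cols.npy')     # column names, cat column included
    data = np.load('data.npy')     # one row of attributes per segment
    indx = np.load('indx.npy')     # boolean mask of the training rows
    Yt = np.load('training_classes.npy')
    Xt = np.load('training_data.npy')        # assumed FTDATA value
    assert data.shape[1] == len(cols) - 1    # data drops the cat column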
Modified: grass-addons/grass7/vector/v.class.ml/training_extraction.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/training_extraction.py 2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/training_extraction.py 2014-05-25 19:07:24 UTC (rev 60476)
@@ -13,9 +13,9 @@
from grass.script.core import overwrite
from grass.pygrass.vector import VectorTopo, Vector
from grass.pygrass.vector.table import Link, Table
-from grass.pygrass.vector.geometry import Area, intersects
+from grass.pygrass.vector.geometry import Line, Area, intersects
from grass.pygrass.vector.basic import Bbox, BoxList
-from grass.pygrass.messages import Messenger
+from grass.pygrass.messages import get_msgr
COLS = [('cat', 'INTEGER PRIMARY KEY'),
@@ -29,9 +29,10 @@
"""
to_up = []
bbox = Bbox()
+ aline = Line()
for area in alist:
bbox = area.bbox(bbox)
- if ((intersects(area.boundary, line)) or
+ if ((intersects(area.get_points(aline), line)) or
(area.contain_pnt(line[0], bbox))):
to_up.append((line.cat, area.cat))
if (cur is not None) and (sql is not None):
@@ -45,20 +46,25 @@
"""
to_up = []
bbox = trn_area.bbox()
+ aline = Line()
+ tline = Line()
for s_id in ids:
seg_area.id = s_id
seg_area.read()
- if ((intersects(seg_area.boundary, trn_area.boundary)) or
- (trn_area.contain_pnt(seg_area.boundary[0], bbox)) or
- (seg_area.contain_pnt(trn_area.boundary[0]))):
+ seg_area.get_points(aline)
+ trn_area.get_points(tline)
+ if ((intersects(aline, tline)) or
+ (trn_area.contain_pnt(aline[0], bbox)) or
+ (seg_area.contain_pnt(tline[0]))):
to_up.append((trn_area.cat, seg_area.cat))
if (cur is not None) and (sql is not None):
cur.executemany(sql, to_up)
return to_up
-def find_lines(table, trn, seg, msgr):
+def find_lines(table, trn, seg):
"""Update the lines' table using the boundaries of the training areas"""
+ msgr = get_msgr()
sql = UPDATE.format(tname=table.name, cat=table.key)
boxlist = BoxList()
n_bounds = len(trn)
@@ -70,24 +76,29 @@
table.conn.commit()
-def find_area(table, trn_ids, trn_area, seg_area, n_areas, seg, msgr):
+def find_area(table, trn_ids, trn_area, seg_area, n_areas, seg):
"""Update the lines' table using the training areas"""
+ msgr = get_msgr()
cur = table.conn.cursor()
msgr.message(_("Finding areas..."))
sql = UPDATE.format(tname=table.name, cat=table.key)
boxlist = BoxList()
+ res = []
for i, trn_id in enumerate(trn_ids):
msgr.percent(i, n_areas, 1)
trn_area.id = trn_id
trn_area.read()
bblist = seg.find['by_box'].areas(trn_area.boundary.bbox(), boxlist,
bboxlist_only=True)
- update_areas(trn_area, seg_area, bblist.ids, cur, sql)
+ res.append(np.array(update_areas(trn_area, seg_area, bblist.ids,
+ cur, sql)))
table.conn.commit()
-def make_new_table(vct, msgr, tname, cols=COLS, force=overwrite()):
+def make_new_table(vct, tname, cols=COLS, force=None):
"""Check/remove/create a new table"""
+ msgr = get_msgr()
+ force = overwrite() if force is None else force
create_link = True
# make a new table
table = Table(tname, vct.table.conn)
@@ -106,9 +117,10 @@
return table, create_link
-def check_balance(table, trntab, msgr):
+def check_balance(table, trntab):
"""Checking the balance between different training classes."""
msg = _('Checking the balance between different training classes.')
+ msgr = get_msgr()
msgr.message(msg)
chk_balance = ("SELECT class, count(*) as num_of_segments "
"FROM {tname} "
@@ -144,20 +156,23 @@
def extract_training(vect, tvect, tlayer):
"""Assign a class to all the areas that contained, are contained
or intersect a training vector"""
- msgr = Messenger()
- with VectorTopo(tvect, mode='r') as trn:
- with VectorTopo(vect, mode='r') as vct:
+ msgr = get_msgr()
+ tname, tmset = tvect.split('@') if '@' in tvect else (tvect, '')
+ vname, vmset = vect.split('@') if '@' in vect else (vect, '')
+ with VectorTopo(tname, tmset, mode='r') as trn:
+ with VectorTopo(vname, vmset, mode='r') as vct:
layer_num, layer_name = get_layer_num_name(vct, tlayer)
# instantiate the area objects
trn_area = Area(c_mapinfo=trn.c_mapinfo)
seg_area = Area(c_mapinfo=vct.c_mapinfo)
n_areas = trn.number_of('areas')
# check/remove/create a new table
- table, create_link = make_new_table(vct, msgr, layer_name)
+ table, create_link = make_new_table(vct, layer_name)
+ find_lines(table, [l for l in trn.viter('lines')], vct)
# find and save all the segments
find_area(table, trn.viter('areas', idonly=True),
- trn_area, seg_area, n_areas, vct, msgr)
- check_balance(table, trn.table, msgr)
+ trn_area, seg_area, n_areas, vct)
+ check_balance(table, trn.table)
if create_link:
msgr.message(_("Connect the new table to the vector map..."))
Modified: grass-addons/grass7/vector/v.class.ml/v.class.ml.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/v.class.ml.py 2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/v.class.ml.py 2014-05-25 19:07:24 UTC (rev 60476)
@@ -69,6 +69,22 @@
#% required: no
#%end
#%option
+#% key: npy_cols
+#% type: string
+#% multiple: no
+#% description: Numpy array with the column names
+#% answer: cols.npy
+#% required: no
+#%end
+#%option
+#% key: npy_index
+#% type: string
+#% multiple: no
+#% description: Boolean numpy array with training indexes.
+#% answer: indx.npy
+#% required: no
+#%end
+#%option
#% key: npy_tdata
#% type: string
#% multiple: no
@@ -101,6 +117,22 @@
#% required: no
#%end
#%option
+#% key: imp_csv
+#% type: string
+#% multiple: no
+#% description: Feature importances with forests of trees: CSV
+#% answer: features_importances.csv
+#% required: no
+#%end
+#%option
+#% key: imp_fig
+#% type: string
+#% multiple: no
+#% description: Feature importances with forests of trees: figure
+#% answer: features_importances.png
+#% required: no
+#%end
+#%option
#% key: scalar
#% type: string
#% multiple: yes
@@ -145,24 +177,51 @@
#%end
#%option
#% key: nan
-#% type: double
-#% multiple: no
-#% description: Value to use to substitute NaN
+#% type: string
+#% multiple: yes
+#% description: Column pattern:Value or Numpy function used to substitute NaN values
#% required: no
+#% answer: *_skewness:nanmean,*_kurtosis:nanmean
#%end
#%option
#% key: inf
+#% type: string
+#% multiple: yes
+#% description: Column pattern:Value or Numpy function used to substitute Inf values
+#% required: no
+#% answer: *_skewness:nanmean,*_kurtosis:nanmean
+#%end
+#%option
+#% key: neginf
+#% type: string
+#% multiple: yes
+#% description: Column pattern:Value or Numpy function used to substitute negative Inf values
+#% required: no
+#% answer:
+#%end
+#%option
+#% key: posinf
#% type: double
+#% multiple: yes
+#% description: Column pattern:Value or Numpy function used to substitute positive Inf values
+#% required: no
+#% answer:
+#%end
+#%option
+#% key: csv_test_cls
+#% type: string
#% multiple: no
-#% description: Value to use to substitute NaN
+#% description: CSV file name for the cross-validation scores of the tested classifiers
#% required: no
+#% answer: test_classifiers.csv
#%end
#%option
-#% key: csv
+#% key: report_class
#% type: string
#% multiple: no
-#% description: csv file name with tha accuracy of different machine learning
+#% description: Text file name for the classification report of each classifier
#% required: no
+#% answer: classification_report.txt
#%end
#%option
#% key: svc_c_range
@@ -244,6 +303,10 @@
#% description: Export to numpy files
#%end
#%flag
+#% key: f
+#% description: Feature importances with forests of trees
+#%end
+#%flag
#% key: b
#% description: Balance the training using the class with the minor number of areas
#%end
@@ -264,36 +327,28 @@
#% description: Test different classification methods
#%end
#%flag
+#% key: v
+#% description: Bias-variance analysis of the tested classifiers
+#%end
+#%flag
#% key: d
#% description: Explore the SVC domain
#%end
#-----------------------------------------------------
-"""
-v.category input=seg005_64@pietro layer=1,2,3,4,5,6,7,8,9 type=point,line,centroid,area,face output=seg005_64_new option=transfer
-
-v.category input=seg005_64_new option=report
-
-i.pca -n input=Combabula_Nearmap.red@PERMANENT,Combabula_Nearmap.green@PERMANENT,Combabula_Nearmap.blue@PERMANENT output_prefix=pca
-PC1 2.78 ( 0.5757, 0.5957, 0.5601) [92.83%]
-PC2 0.20 ( 0.6002, 0.1572,-0.7842) [ 6.81%]
-PC3 0.01 ( 0.5552,-0.7877, 0.2670) [ 0.36%]
-
-time r.texture -a input=pca.1@pietro prefix=pca5_ size=5 --o
-time r.texture -a input=pca.1@pietro prefix=pca3_ size=3 --o
-echo finish
-"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import imp
import sys
import os
+from pprint import pprint
+from fnmatch import fnmatch
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from grass.pygrass.functions import get_lib_path
-from grass.pygrass.messages import Messenger
+from grass.pygrass.messages import get_msgr
from grass.pygrass.vector import Vector
from grass.pygrass.modules import Module
from grass.script.core import parser, overwrite
@@ -311,8 +366,19 @@
optimize_training, explore_SVC, plot_grid)
from sqlite2npy import save2npy
from npy2table import export_results
+from features import importances, tocsv
+RULES = {'*_skewness': np.nanmean,
+ '*_coeff_var': np.nanmean,
+ '*_stddev': np.nanmean,
+ '*_variance': np.nanmean,
+ '*_mean': np.nanmean,
+ '*_range': np.nanmean,
+ '*_max': np.nanmax,
+ '*_min': np.nanmin, }
+
+
def get_indexes(string, sep=',', rangesep='-'):
"""
>>> indx = '1-5,34-36,40'
@@ -333,8 +399,74 @@
cur = vct.table.execute('SELECT cat, color FROM %s;' % vct.name)
return dict([c for c in cur.fetchall()])
+
+def convert(string):
+ try:
+ return float(string)
+ except ValueError:
+ try:
+ return getattr(np, string)
+ except AttributeError:
+ msg = "Not a valid option: <%s> is neither a number nor a numpy function."
+ raise TypeError(msg % string)
+
+
+def get_rules(string):
+ res = {}
+ pairs = [s.strip().split(':') for s in string.strip().split(',')]
+ for key, val in pairs:
+ res[key] = convert(val)
+ return res
+
+
+def find_special_cols(array, cols, report=True,
+ special=('nan', 'inf', 'neginf', 'posinf')):
+ sp = {key: [] for key in special}
+ cntr = {key: [] for key in special}
+ for i in range(len(cols)):
+ for key in special:
+ barray = getattr(np, 'is%s' % key)(array[:, i])
+ if barray.any():
+ sp[key].append(i)
+ cntr[key].append(barray.sum())
+ if report:
+ indent = ' '
+ tot = len(array)
+ for key in special:
+ fmt = '- %15s (%3d/%d, %4.3f%%)'
+ strs = [fmt % (col, cnt, tot, cnt/float(tot)*100)
+ for col, cnt in zip(cols[np.array(sp[key])], cntr[key])]
+ print('%s:\n%s' % (key, indent), ('\n%s' % indent).join(strs),
+ sep='')
+ return sp
+
+
+def substitute(X, rules, cols):
+ vals = {}
+ special_cols = find_special_cols(X, cols)
+ pprint(special_cols)
+ for key in rules.keys():
+ vals[key] = {}
+ for i in special_cols[key]:
+ for rule in rules[key]:
+ if fnmatch(cols[i], rule):
+ indx = getattr(np, 'is%s' % key)(X[:, i])
+ val = (rules[key][rule] if np.isscalar(rules[key][rule])
+ else rules[key][rule](X[:, i][~indx]))
+ X[:, i][indx] = val
+ vals[key][cols[i]] = val
+ return X, vals
+
+
+def extract_classes(vect, layer):
+ vect, mset = vect.split('@') if '@' in vect else (vect, '')
+ with Vector(vect, mapset=mset, layer=layer, mode='r') as vct:
+ vct.table.filters.select('cat', 'class')
+ return {key: val for key, val in vct.table.execute()}
+
+
def main(opt, flg):
- msgr = Messenger()
+ msgr = get_msgr()
indexes = None
vect = opt['vector']
vtraining = opt['vtraining'] if opt['vtraining'] else None
@@ -343,6 +475,9 @@
tlayer = opt['tlayer'] if opt['tlayer'] else vect + '_training'
rlayer = opt['rlayer'] if opt['rlayer'] else vect + '_results'
+ labels = extract_classes(vtraining, vlayer)
+ pprint(labels)
+
if opt['scalar']:
scapar = opt['scalar'].split(',')
scaler = StandardScaler(with_mean='with_mean' in scapar,
@@ -355,7 +490,10 @@
if flg['n']:
msgr.message("Save arrays to npy files.")
- save2npy(vect, vlayer, tlayer)
+ save2npy(vect, vlayer, tlayer,
+ fcats=opt['npy_cats'], fcols=opt['npy_cols'],
+ fdata=opt['npy_data'], findx=opt['npy_index'],
+ fclss=opt['npy_tclasses'], ftdata=opt['npy_tdata'])
# define the classifiers to use/test
if opt['pyclassifiers'] and opt['pyvar']:
@@ -378,29 +516,39 @@
indexes = [i for i in get_indexes(opt['pyindx'])]
classifiers = [classifiers[i] for i in indexes]
- csv = open(opt['csv'], 'w') if opt['csv'] else sys.stdout
num = int(opt['n_training']) if opt['n_training'] else None
# load fron npy files
Xt = np.load(opt['npy_tdata'])
Yt = np.load(opt['npy_tclasses'])
- clsses = sorted(set(Yt))
+ cols = np.load(opt['npy_cols'])
- # Substitute NaN
- if opt['nan']:
- msgr.message("Substitute NaN values with: <%g>" % float(opt['nan']))
- Xt[np.isnan(Xt)] = float(opt['nan'])
- if opt['inf']:
- msgr.message("Substitute Inf values with: <%g>" % float(opt['inf']))
- Xt[np.isinf(Xt)] = float(opt['inf'])
+ # Define rules to substitute NaN, Inf, posInf, negInf values
+ rules = {}
+ for key in ('nan', 'inf', 'neginf', 'posinf'):
+ if opt[key]:
+ rules[key] = get_rules(opt[key])
+ pprint(rules)
+ # Substitute (skip cat column)
+ Xt, rules_vals = substitute(Xt, rules, cols[1:])
+
+ # Feature importances with forests of trees
+ if flg['f']:
+ importances(Xt, Yt, cols[1:],
+ csv=opt['imp_csv'], img=opt['imp_fig'],
+ # default parameters to save the matplotlib figure
+ **dict(dpi=300, transparent=False, bbox_inches='tight'))
+
# optimize the training set
if flg['o']:
ind_optimize = (int(opt['pyindx_optimize']) if opt['pyindx_optimize']
else 0)
cls = classifiers[ind_optimize]
msgr.message("Find the optimum training set.")
- best, Xbt, Ybt = optimize_training(cls, Xt, Yt, scaler,
+ best, Xbt, Ybt = optimize_training(cls, Xt, Yt,
+ labels, #{v: k for k, v in labels.items()},
+ scaler,
num=num, maxiterations=1000)
msg = " - save the optimum training data set to: %s."
msgr.message(msg % opt['npy_btdata'])
@@ -435,33 +583,40 @@
msgr.message("Exploring the SVC domain.")
grid = explore_SVC(Xbt, Ybt, n_folds=3, n_jobs=int(opt['svc_n_jobs']),
C=C_range, gamma=gamma_range, kernel=kernel_range)
+ import pickle
+ with open('grid.pkl', 'wb') as pkl:
+ pickle.dump(grid, pkl)
plot_grid(grid, save=opt['svc_img'])
# test the accuracy of different classifiers
if flg['t']:
# test different classifiers
msgr.message("Exploring different classifiers.")
- explorer_clsfiers(classifiers, Xbt, Ybt, Xt, Yt, clsses, indexes, csv)
+ msgr.message("cls_id cls_name mean max min std")
+ #import ipdb; ipdb.set_trace()
+ res = explorer_clsfiers(classifiers, Xt, Yt,
+ indexes=indexes, n_folds=5, bv=flg['v'])
+ # TODO: sort(order=...) is working only in the terminal, why?
+ #res.sort(order='mean')
+ with open(opt['csv_test_cls'], 'w') as csv:
+ csv.write(tocsv(res))
if flg['c']:
# classify
cols = []
data = np.load(opt['npy_data'])
- if opt['nan']:
- msg = "Substitute NaN values with: <%g>" % float(opt['nan'])
- msgr.message(msg)
- data[np.isnan(data)] = float(opt['nan'])
- if opt['inf']:
- msg = "Substitute Inf values with: <%g>" % float(opt['inf'])
- msgr.message(msg)
- data[np.isinf(data)] = float(opt['inf'])
+ pprint(rules_vals)
+ # Substitute with the values computed on the training set (skip the
+ # cat column; reload the column names because `cols` is reused below
+ # for the result table definition)
+ data, _ = substitute(data, rules_vals, np.load(opt['npy_cols'])[1:])
msgr.message("Scaling the whole data set.")
data = scaler.transform(data) if scaler else data
cats = np.load(opt['npy_cats'])
for cls in classifiers:
- run_classifier(cls, Xbt, Ybt, Xt, Yt, clsses, data, save=csv)
+ # run_classifier expects a writable file object for the report
+ run_classifier(cls, Xbt, Ybt, Xt, Yt, labels, data,
+ report=open(opt['report_class'], 'a'))
cols.append((cls['name'], 'INTEGER'))
# import pickle
@@ -494,6 +649,5 @@
rclrs(map=rst, rules='-', stdin_=rules)
-
if __name__ == "__main__":
main(*parser())
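
Worked example of the new nan/inf option parsing (it mirrors get_rules()/convert() above; the input string is the default answer of the nan option):

    import numpy as np

    string = '*_skewness:nanmean,*_kurtosis:nanmean'
    pairs = [s.strip().split(':') for s in string.strip().split(',')]
    rules = {key: getattr(np, val) for key, val in pairs}
    # -> {'*_skewness': np.nanmean, '*_kurtosis': np.nanmean}; each column
    # matching a pattern gets its NaNs replaced by the function applied to
    # the column's remaining values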