[GRASS-SVN] r60476 - grass-addons/grass7/vector/v.class.ml

svn_grass at osgeo.org svn_grass at osgeo.org
Sun May 25 12:07:24 PDT 2014


Author: zarch
Date: 2014-05-25 12:07:24 -0700 (Sun, 25 May 2014)
New Revision: 60476

Added:
   grass-addons/grass7/vector/v.class.ml/features.py
Modified:
   grass-addons/grass7/vector/v.class.ml/Makefile
   grass-addons/grass7/vector/v.class.ml/ml_classifiers.py
   grass-addons/grass7/vector/v.class.ml/ml_functions.py
   grass-addons/grass7/vector/v.class.ml/sqlite2npy.py
   grass-addons/grass7/vector/v.class.ml/training_extraction.py
   grass-addons/grass7/vector/v.class.ml/v.class.ml.py
Log:
Add new classifiers and new options

Modified: grass-addons/grass7/vector/v.class.ml/Makefile
===================================================================
--- grass-addons/grass7/vector/v.class.ml/Makefile	2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/Makefile	2014-05-25 19:07:24 UTC (rev 60476)
@@ -2,7 +2,8 @@
 
 PGM = v.class.ml
 
-ETCFILES = training_extraction ml_classifiers ml_functions sqlite2npy npy2table
+ETCFILES = training_extraction ml_classifiers ml_functions \
+           sqlite2npy npy2table features
 
 include $(MODULE_TOPDIR)/include/Make/Script.make
 include $(MODULE_TOPDIR)/include/Make/Python.make

Added: grass-addons/grass7/vector/v.class.ml/features.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/features.py	                        (rev 0)
+++ grass-addons/grass7/vector/v.class.ml/features.py	2014-05-25 19:07:24 UTC (rev 60476)
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import (absolute_import, division, print_function)
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.ensemble import ExtraTreesClassifier
+
+
+N_ESTIMATORS = 500
+RANDOM_STATE = 0
+
+
+IMP_DTYPE = [('col', '<U24'), ('imp', 'f'), ('std', 'f')]
+
+
+def tocsv(array, sep=';', fmt='%s'):
+    return '\n'.join([sep.join([fmt % el for el in row]) for row in array])
+
+
+def importances(X, y, cols, csv='', img='',
+                clf=ExtraTreesClassifier(n_estimators=N_ESTIMATORS,
+                                         random_state=RANDOM_STATE),
+                **savefig):
+    clf.fit(X, y)
+    imp = clf.feature_importances_
+    std = np.std([est.feature_importances_ for est in clf.estimators_], axis=0)
+    res = np.array([(c, i, s) for c, i, s in zip(cols, imp, std)],
+                   dtype=IMP_DTYPE)
+    res.sort(order='imp')
+    res = res[::-1]
+    if csv:
+        with open(csv, 'w') as csv:
+            csv.write(tocsv(res))
+    if img:
+        fig, ax = plt.subplots(figsize=(5, 40))
+        pos = range(len(cols))
+        ax.barh(pos, res['imp'] * 100, align='center', alpha=0.4)
+        ax.set_yticks(pos)
+        ax.set_yticklabels(res['col'])
+        ax.set_xlabel('Importance [%]')
+        ax.set_title("Feature importances")
+        ax.grid()
+        fig.savefig(img, **savefig)
+    return res

Modified: grass-addons/grass7/vector/v.class.ml/ml_classifiers.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_classifiers.py	2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/ml_classifiers.py	2014-05-25 19:07:24 UTC (rev 60476)
@@ -9,13 +9,18 @@
 from gettext import lgettext as _
 
 from sklearn.linear_model import SGDClassifier
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import (AdaBoostClassifier,
+                              ExtraTreesClassifier,
+                              GradientBoostingClassifier,
+                              RandomForestClassifier,
+                              RandomTreesEmbedding)
 from sklearn.neighbors import (NearestNeighbors,
                                KNeighborsClassifier,
                                RadiusNeighborsClassifier,
                                NearestCentroid)
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
+from sklearn import metrics
 
 
 from grass.pygrass.messages import Messenger
@@ -85,44 +90,7 @@
      'kwargs': {'n_neighbors': 8, 'weights': 'distance'}},
     {'name': 'knn16_distance', 'classifier': KNeighborsClassifier,
      'kwargs': {'n_neighbors': 16, 'weights': 'distance'}},
-    # radius
-    {'name': 'knn_radius_0p5_uniform',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 0.5, 'weights': 'uniform'}},
-    {'name': 'knn_radius_1_uniform',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 1., 'weights': 'uniform'}},
-    {'name': 'knn_radius_1p5_uniform',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 1.5, 'weights': 'uniform'}},
-    {'name': 'knn_radius_2_uniform',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 2., 'weights': 'uniform'}},
-    {'name': 'knn_radius_2p5_uniform',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 2.5, 'weights': 'uniform'}},
-    {'name': 'knn_radius_5_uniform',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 5., 'weights': 'uniform'}},
 
-    {'name': 'knn_radius_0p5_distance',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 0.5, 'weights': 'distance'}},
-    {'name': 'knn_radius_1_distance',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 1., 'weights': 'distance'}},
-    {'name': 'knn_radius_1p5_distance',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 1.5, 'weights': 'distance'}},
-    {'name': 'knn_radius_2_distance',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 2., 'weights': 'distance'}},
-    {'name': 'knn_radius_2p5_distance',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 2.5, 'weights': 'distance'}},
-    {'name': 'knn_radius_5_distance',
-     'classifier': RadiusNeighborsClassifier,
-     'kwargs': {'radius': 5., 'weights': 'distance'}},
     # centroid
     # ‘euclidean’, ‘l2’, ‘l1’, ‘manhattan’, ‘cityblock’
     #  [‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘cosine’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]
@@ -161,25 +129,25 @@
 
     {'name': 'knn_centroid_manhattan_none', 'classifier': NearestCentroid,
      'kwargs': {'metric': 'manhattan', 'shrink_threshold ': None}},
-    {'name': 'knn_centroid_manhattan_0p5', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 0.5}},
-    {'name': 'knn_centroid_manhattan_1', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.0}},
-    {'name': 'knn_centroid_manhattan_1p5', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.5}},
-    {'name': 'knn_centroid_manhattan_2', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 2.0}},
+#    {'name': 'knn_centroid_manhattan_0p5', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 0.5}},
+#    {'name': 'knn_centroid_manhattan_1', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.0}},
+#    {'name': 'knn_centroid_manhattan_1p5', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 1.5}},
+#    {'name': 'knn_centroid_manhattan_2', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'manhattan', 'shrink_threshold ': 2.0}},
 
     {'name': 'knn_centroid_cityblock_none', 'classifier': NearestCentroid,
      'kwargs': {'metric': 'cityblock', 'shrink_threshold ': None}},
-    {'name': 'knn_centroid_cityblock_0p5', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 0.5}},
-    {'name': 'knn_centroid_cityblock_1', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.0}},
-    {'name': 'knn_centroid_cityblock_1p5', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.5}},
-    {'name': 'knn_centroid_cityblock_2', 'classifier': NearestCentroid,
-     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 2.0}},
+#    {'name': 'knn_centroid_cityblock_0p5', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 0.5}},
+#    {'name': 'knn_centroid_cityblock_1', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.0}},
+#    {'name': 'knn_centroid_cityblock_1p5', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 1.5}},
+#    {'name': 'knn_centroid_cityblock_2', 'classifier': NearestCentroid,
+#     'kwargs': {'metric': 'cityblock', 'shrink_threshold ': 2.0}},
     #
     # Tree
     #
@@ -192,12 +160,12 @@
      'kwargs': {'criterion': 'gini', 'max_depth': 'sqrt'}},
     {'name': 'd_tree_gini_log2', 'classifier': DecisionTreeClassifier,
      'kwargs': {'criterion': 'gini', 'max_depth': 'log2'}},
-    {'name': 'd_tree_gini_0p25', 'classifier': DecisionTreeClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 0.25}},
-    {'name': 'd_tree_gini_0p50', 'classifier': DecisionTreeClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 0.5}},
-    {'name': 'd_tree_gini_0p75', 'classifier': DecisionTreeClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 0.75}},
+#    {'name': 'd_tree_gini_0p25', 'classifier': DecisionTreeClassifier,
+#     'kwargs': {'criterion': 'gini', 'max_depth': 0.25}},
+#    {'name': 'd_tree_gini_0p50', 'classifier': DecisionTreeClassifier,
+#     'kwargs': {'criterion': 'gini', 'max_depth': 0.5}},
+#    {'name': 'd_tree_gini_0p75', 'classifier': DecisionTreeClassifier,
+#     'kwargs': {'criterion': 'gini', 'max_depth': 0.75}},
 
     {'name': 'd_tree_entropy', 'classifier': DecisionTreeClassifier,
      'kwargs': {'criterion': 'entropy', 'splitter': 'best', 'max_depth': None,
@@ -208,46 +176,156 @@
      'kwargs': {'criterion': 'entropy', 'max_depth': 'sqrt'}},
     {'name': 'd_tree_entropy_log2', 'classifier': DecisionTreeClassifier,
      'kwargs': {'criterion': 'entropy', 'max_depth': 'log2'}},
-    {'name': 'd_tree_entropy_0p25', 'classifier': DecisionTreeClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 0.25}},
-    {'name': 'd_tree_entropy_0p50', 'classifier': DecisionTreeClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 0.5}},
-    {'name': 'd_tree_entropy_0p75', 'classifier': DecisionTreeClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 0.75}},
+#    {'name': 'd_tree_entropy_0p25', 'classifier': DecisionTreeClassifier,
+#     'kwargs': {'criterion': 'entropy', 'max_depth': 0.25}},
+#    {'name': 'd_tree_entropy_0p50', 'classifier': DecisionTreeClassifier,
+#     'kwargs': {'criterion': 'entropy', 'max_depth': 0.5}},
+#    {'name': 'd_tree_entropy_0p75', 'classifier': DecisionTreeClassifier,
+#     'kwargs': {'criterion': 'entropy', 'max_depth': 0.75}},
 
+    #
+    # Forest
+    #
     {'name': 'rand_tree_gini', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'gini', 'splitter': 'best', 'max_depth': None,
+     'kwargs': {'criterion': 'gini', 'max_depth': None,
                 'min_samples_split': 2, 'min_samples_leaf': 1,
                 'max_features': None, 'random_state': None,
                 'min_density': None}},
     {'name': 'rand_tree_gini_sqrt', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 'sqrt'}},
+     'kwargs': {'criterion': 'gini', 'max_depth': None,
+                'min_samples_split': 2, 'min_samples_leaf': 1,
+                'max_features': 'sqrt', 'random_state': None,
+                'min_density': None}},
     {'name': 'rand_tree_gini_log2', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 'log2'}},
+     'kwargs': {'criterion': 'gini', 'max_depth': None,
+                'min_samples_split': 2, 'min_samples_leaf': 1,
+                'max_features': 'log2', 'random_state': None,
+                'min_density': None}},
+    {'name': 'rand_tree_gini_0p05', 'classifier': RandomForestClassifier,
+     'kwargs': {'criterion': 'gini', 'max_features': 0.05}},
     {'name': 'rand_tree_gini_0p25', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 0.25}},
+     'kwargs': {'criterion': 'gini', 'max_features': 0.25}},
     {'name': 'rand_tree_gini_0p50', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 0.5}},
+     'kwargs': {'criterion': 'gini', 'max_features': 0.5}},
     {'name': 'rand_tree_gini_0p75', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'gini', 'max_depth': 0.75}},
+     'kwargs': {'criterion': 'gini', 'max_features': 0.75}},
 
     {'name': 'rand_tree_entropy', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'entropy', 'splitter': 'best', 'max_depth': None,
+     'kwargs': {'criterion': 'entropy', 'max_depth': None,
                 'min_samples_split': 2, 'min_samples_leaf': 1,
                 'max_features': None, 'random_state': None,
                 'min_density': None}},
     {'name': 'rand_tree_entropy_sqrt', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 'sqrt'}},
+     'kwargs': {'criterion': 'entropy', 'max_depth': None,
+                'min_samples_split': 2, 'min_samples_leaf': 1,
+                'max_features': 'sqrt', 'random_state': None,
+                'min_density': None}},
     {'name': 'rand_tree_entropy_log2', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 'log2'}},
+     'kwargs': {'criterion': 'entropy', 'max_depth': None,
+                'min_samples_split': 2, 'min_samples_leaf': 1,
+                'max_features': 'log2', 'random_state': None,
+                'min_density': None}},
     {'name': 'rand_tree_entropy_0p25', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 0.25}},
+     'kwargs': {'criterion': 'entropy', 'max_features': 0.25}},
     {'name': 'rand_tree_entropy_0p50', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 0.5}},
+     'kwargs': {'criterion': 'entropy', 'max_features': 0.5}},
     {'name': 'rand_tree_entropy_0p75', 'classifier': RandomForestClassifier,
-     'kwargs': {'criterion': 'entropy', 'max_depth': 0.75}},
+     'kwargs': {'criterion': 'entropy', 'max_features': 0.75}},
 
+#    # RandomTreesEmbedding
+#    {'name': 'rand_tree_emb_10_5', 'classifier': RandomTreesEmbedding,
+#     'kwargs': dict(n_estimators=10, max_depth=5, min_samples_split=2,
+#                    min_samples_leaf=1, n_jobs=-1, random_state=None, verbose=0,
+#                    min_density=None)},
+#    {'name': 'rand_tree_emb_10_5_leaf3', 'classifier': RandomTreesEmbedding,
+#     'kwargs': dict(n_estimators=10, max_depth=5, min_samples_split=2,
+#                    min_samples_leaf=3, n_jobs=1, random_state=None, verbose=0,
+#                    min_density=None)},
+#    {'name': 'rand_tree_emb_10_50', 'classifier': RandomTreesEmbedding,
+#     'kwargs': dict(n_estimators=10, max_depth=50, min_samples_split=2,
+#                    min_samples_leaf=1, n_jobs=1, random_state=None, verbose=0,
+#                    min_density=None)},
+#    {'name': 'rand_tree_emb_100_50', 'classifier': RandomTreesEmbedding,
+#     'kwargs': dict(n_estimators=100, max_depth=50, min_samples_split=2,
+#                    min_samples_leaf=1, n_jobs=1, random_state=None, verbose=0,
+#                    min_density=None)},
+#    {'name': 'rand_tree_emb_100_50', 'classifier': RandomTreesEmbedding,
+#     'kwargs': dict(n_estimators=100, max_depth=50, min_samples_split=2,
+#                    min_samples_leaf=3, n_jobs=1, random_state=None, verbose=0,
+#                    min_density=None)},
+
     #
+    # AdaBoost classifier
+    #
+    {'name': 'ada_50_1.0', 'classifier': AdaBoostClassifier,
+     'kwargs': dict(base_estimator=DecisionTreeClassifier(compute_importances=None, criterion='gini',
+                    max_depth=1, max_features=None, min_density=None,
+                    min_samples_leaf=1, min_samples_split=2, random_state=None,
+                    splitter='best'),
+                    n_estimators=50, learning_rate=1.0,
+                    algorithm='SAMME.R', random_state=None)},
+    {'name': 'ada_50_1.0_minleaf3', 'classifier': AdaBoostClassifier,
+     'kwargs': dict(base_estimator=DecisionTreeClassifier(compute_importances=None, criterion='gini',
+                    max_depth=1, max_features=None, min_density=None,
+                    min_samples_leaf=3, min_samples_split=2, random_state=None,
+                    splitter='best'),
+                    n_estimators=50, learning_rate=1.0,
+                    algorithm='SAMME.R', random_state=None)},
+    {'name': 'ada_50_0.5', 'classifier': AdaBoostClassifier,
+     'kwargs': dict(base_estimator=DecisionTreeClassifier(compute_importances=None, criterion='gini',
+                    max_depth=1, max_features=None, min_density=None,
+                    min_samples_leaf=1, min_samples_split=2, random_state=None,
+                    splitter='best'),
+                    n_estimators=50, learning_rate=0.5,
+                    algorithm='SAMME.R', random_state=None)},
+
+    {'name': 'extra_tree_10_1', 'classifier': ExtraTreesClassifier,
+     'kwargs': dict(n_estimators=10, criterion='gini', max_depth=None,
+                    min_samples_split=2, min_samples_leaf=1,
+                    max_features='auto', bootstrap=False, oob_score=False,
+                    n_jobs=1, random_state=None, verbose=0, min_density=None,
+                    compute_importances=None)},
+    {'name': 'extra_tree_10_3', 'classifier': ExtraTreesClassifier,
+     'kwargs': dict(n_estimators=10, criterion='gini', max_depth=None,
+                    min_samples_split=2, min_samples_leaf=3,
+                    max_features='auto', bootstrap=False, oob_score=False,
+                    n_jobs=1, random_state=None, verbose=0, min_density=None,
+                    compute_importances=None)},
+    {'name': 'extra_tree_100_1', 'classifier': ExtraTreesClassifier,
+     'kwargs': dict(n_estimators=100, criterion='gini', max_depth=None,
+                    min_samples_split=2, min_samples_leaf=1,
+                    max_features='auto', bootstrap=False, oob_score=False,
+                    n_jobs=1, random_state=None, verbose=0, min_density=None,
+                    compute_importances=None)},
+    {'name': 'extra_tree_100_3', 'classifier': ExtraTreesClassifier,
+     'kwargs': dict(n_estimators=100, criterion='gini', max_depth=None,
+                    min_samples_split=2, min_samples_leaf=3,
+                    max_features='auto', bootstrap=False, oob_score=False,
+                    n_jobs=1, random_state=None, verbose=0, min_density=None,
+                    compute_importances=None)},
+    {'name': 'extra_tree_100_5', 'classifier': ExtraTreesClassifier,
+     'kwargs': dict(n_estimators=100, criterion='gini', max_depth=None,
+                    min_samples_split=2, min_samples_leaf=5,
+                    max_features='auto', bootstrap=False, oob_score=False,
+                    n_jobs=1, random_state=None, verbose=0, min_density=None,
+                    compute_importances=None)},
+    {'name': 'gradient_boost_100_minleaf1', 'classifier': GradientBoostingClassifier,
+     'kwargs': dict(loss='deviance', learning_rate=0.1, n_estimators=100,
+                    subsample=1.0, min_samples_split=2, min_samples_leaf=1,
+                    max_depth=3, init=None, random_state=None,
+                    max_features=None, verbose=0)},
+    {'name': 'gradient_boost_100_meanleaf3', 'classifier': GradientBoostingClassifier,
+     'kwargs': dict(loss='deviance', learning_rate=0.1, n_estimators=100,
+                    subsample=1.0, min_samples_split=2, min_samples_leaf=3,
+                    max_depth=3, init=None, random_state=None,
+                    max_features=None, verbose=0)},
+    {'name': 'gradient_boost_100_meanleaf5', 'classifier': GradientBoostingClassifier,
+     'kwargs': dict(loss='deviance', learning_rate=0.1, n_estimators=100,
+                    subsample=1.0, min_samples_split=2, min_samples_leaf=5,
+                    max_depth=3, init=None, random_state=None,
+                    max_features=None, verbose=0)},
+
+    #
     # Gausian
     #
     {'name': 'gaussianNB', 'classifier': GaussianNB},
@@ -390,4 +468,4 @@
         # 'classifier': MLPYWrapper(mlpy.MaximumLikelihoodC)},
     ]
     # add MLPY
-    CLASSIFIERS.extend(MLPY_CLS)
+    #CLASSIFIERS.extend(MLPY_CLS)

Modified: grass-addons/grass7/vector/v.class.ml/ml_functions.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_functions.py	2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/ml_functions.py	2014-05-25 19:07:24 UTC (rev 60476)
@@ -4,32 +4,40 @@
 
 @author: pietro
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import (absolute_import, division, print_function)
 import time
 import random as rnd
 from gettext import lgettext as _
 import sys
+import pickle as pk
 
 import numpy as np
-import pylab as pl
+import matplotlib.pyplot as plt
 
 
-from sklearn.metrics import accuracy_score
+from sklearn import metrics as metrics
+from sklearn.metrics import precision_recall_curve as prc, roc_curve, auc
 from sklearn.cross_validation import StratifiedKFold
 from sklearn.grid_search import GridSearchCV
 from sklearn.svm import SVC
+from sklearn.cross_validation import cross_val_score
 
-from grass.pygrass.messages import Messenger
+#from grass.pygrass.messages import get_msgr
 
-MSGR = Messenger()
 
-
 COLS = [('cat', 'INTEGER PRIMARY KEY'),
         ('class', 'INTEGER'),
         ('color', 'VARCHAR(11)'), ]
 
 
+SCORES_DTYPE = [('index', 'i'),
+                ('name', '<U32'),
+                ('mean', 'f'),
+                ('max', 'f'),
+                ('min', 'f'),
+                ('std', 'f')]
+
+
 def print_cols(clss, sep=';', save=sys.stdout):
     clsses = sorted(set(clss))
     cols = ['ml_index', 'ml_name', 'fit_time', 'prediction_time',
@@ -50,26 +58,29 @@
     print(sep.join(res), file=save)
 
 
-def accuracy(sol, cls=None, data=None, clss=None, pred=None):
+def accuracy(sol, cls=None, data=None, labels=None, pred=None):
     cls = cls if cls else dict()
-    clsses = clss if clss else sorted(set(sol))
+    clsses = sorted(labels.keys())
     if 'cls' in cls:
         cls['pred_start'] = time.time()
         pred = cls['cls'].predict(data)
         cls['pred_stop'] = time.time()
 
-    cls['t_acc'] = accuracy_score(sol, pred, normalize=True)
+    cls['t_acc'] = metrics.accuracy_score(sol, pred, normalize=True)
+    lab = [labels[key] for key in clsses]
+    cls['report'] = metrics.classification_report(sol, pred, lab)
+    cls['confusion'] = metrics.confusion_matrix(sol, pred, lab)
     c_acc = []
     for c in clsses:
-        indx = sol == c
-        c_acc.append(accuracy_score(sol[indx], pred[indx],
-                                    normalize=True))
+        indx = (sol == c).nonzero()
+        c_acc.append(metrics.accuracy_score(sol[indx], pred[indx],
+                                            normalize=True))
     cls['c_acc'] = np.array(c_acc)
     cls['c_acc_mean'] = cls['c_acc'].mean()
     return cls
 
 
-def test_classifier(cls, Xt, Yt, Xd, Yd, clss, save=sys.stdout,
+def test_classifier(cls, Xt, Yt, Xd, Yd, labels, save=sys.stdout,
                     verbose=True):
     cls['cls'] = cls['classifier'](**cls.get('kwargs', {}))
     cls['fit_start'] = time.time()
@@ -79,18 +90,20 @@
         cls['params'] = cls['cls'].get_params()
     except AttributeError:
         cls['params'] = None
-    accuracy(Yd, cls, Xd, clss)
+    accuracy(Yd, cls, Xd, labels)
     if verbose:
         print_test(cls, save=save)
 
 
-def run_classifier(cls, Xt, Yt, Xd, Yd, clss, data,
-                   save=sys.stdout):
-    test_classifier(cls, Xt, Yt, Xd, Yd, clss, verbose=False)
+def run_classifier(cls, Xt, Yt, Xd, Yd, labels, data,
+                   report=sys.stdout):
+    test_classifier(cls, Xt, Yt, Xd, Yd, labels, verbose=False)
     cls['pred_start'] = time.time()
     cls['predict'] = cls['cls'].predict(data)
     cls['pred_stop'] = time.time()
-    print_test(cls, save=save)
+    print_test(cls, save=report)
+    report.write('\n' + cls['report'])
+    report.write('\n' + cls['confusion'])
     np.save(cls['name'] + '.npy', cls['predict'])
 
 
@@ -119,13 +132,14 @@
     return bdata, bclss
 
 
-def optimize_training(cls, tdata, tclss,
+def optimize_training(cls, tdata, tclss, labels,
                       scaler=None, num=None, maxiterations=1000):
     best = cls.copy()
     best['c_acc_mean'] = 0
     means = []
+    #msgr = get_msgr()
     for i in range(maxiterations):  # TODO: use multicore
-        MSGR.percent(i, maxiterations, 1)
+        #msgr.percent(i, maxiterations, 1)
         Xt, Yt = balance(tdata, tclss, num)
         if scaler:
             scaler.fit(Xt, Yt)
@@ -133,7 +147,7 @@
             stdata = scaler.transform(tdata)
         else:
             sXt, stdata = Xt, tdata
-        test_classifier(cls, sXt, Yt, stdata, tclss, None, verbose=False)
+        test_classifier(cls, sXt, Yt, stdata, tclss, labels, verbose=False)
         if cls['c_acc_mean'] > best['c_acc_mean']:
             print("%f > %f" % (cls['c_acc_mean'], best['c_acc_mean']))
             best = cls.copy()
@@ -149,7 +163,132 @@
     return best, bXt, bYt
 
 
-def explorer_clsfiers(clsses, Xt, Yt, Xd, Yd, clss,
+def plot_bias_variance(data_sizes, train_errors, test_errors, name,
+                       title="Bias-Variance for '%s'",
+                       train_err_std=None, test_err_std=None,
+                       train_stl='-', test_stl='-',
+                       train_width=1, test_width=1,
+                       train_clr='b', test_clr='r', alpha=0.2,
+                       fmt='png', **kwargs):
+    fig, ax = plt.subplots(figsize=(6, 5))
+    ax.set_ylim([0.0, 1.0])
+    ax.set_xlabel('Data set size')
+    ax.set_ylabel('Error')
+    ax.set_title(title % name)
+    if train_err_std is not None:
+        ax.fill_between(data_sizes,
+                        train_errors - train_err_std,
+                        train_errors + train_err_std,
+                        facecolor=train_clr, alpha=alpha)
+    if test_err_std is not None:
+        ax.fill_between(data_sizes,
+                        test_errors - test_err_std,
+                        test_errors + test_err_std,
+                        facecolor=test_clr, alpha=alpha)
+    ax.plot(data_sizes, test_errors, label="test error", color=test_clr,
+            linestyle=test_stl, linewidth=test_width)
+    ax.plot(data_sizes, train_errors, label="train error", color=train_clr,
+            linestyle=train_stl, linewidth=train_width)
+    ax.legend(loc="upper right")
+    ax.grid(True, linestyle='-', color='0.75')
+    fig.savefig("bv_%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+
+
+def bias_variance_analysis(cls, tdata, tclss, n_folds=5, step=5):
+    clss = sorted(set(tclss))
+    num = min([len(tclss[tclss == c]) for c in clss])
+
+    clf = cls['classifier'](**cls['kwargs'])
+    keys = ('fprs', 'tprs', 'roc_scores', 'pr_scores', 'precisions',
+            'recalls', 'thresholds')
+
+    bv = {}
+    lk = {l: {k: [] for k in keys} for l in clss}
+    for n in range(5, num, step):
+        X, y = balance(tdata, tclss, n)
+        cv = StratifiedKFold(y, n_folds=n_folds)
+        # instantiate empty lists
+        train_errors, test_errors, scores = [], [], []
+        for train, test in cv:
+            X_train, y_train = X[train], y[train]
+            X_test, y_test = X[test], y[test]
+
+            # fit train data
+            clf.fit(X_train, y_train)
+
+            # get score
+            train_score = clf.score(X_train, y_train)
+            test_score = clf.score(X_test, y_test)
+            scores.append(test_score)
+
+            # get errors
+            train_errors.append(1 - train_score)
+            test_errors.append(1 - test_score)
+
+            # get probability
+            proba = clf.predict_proba(X_test)
+
+            # compute score for each class VS rest
+            for idx, label in enumerate(clss):
+                fpr, tpr, roc_thr = roc_curve(y_test, proba[:, idx], label)
+                precision, recall, pr_thr = prc(y_test, proba[:, idx], label)
+                lk[label]['fprs'].append(fpr)
+                lk[label]['tprs'].append(tpr)
+                lk[label]['roc_scores'].append(auc(fpr, tpr))
+
+                lk[label]['precisions'].append(precision)
+                lk[label]['recalls'].append(recall)
+                lk[label]['thresholds'].append(pr_thr)
+                lk[label]['pr_scores'].append(auc(recall, precision))
+        bv[n] = {'test': np.array(test_errors),
+                 'train': np.array(train_errors),
+                 'score': np.array(scores)}
+    cls['bias variance'] = bv
+    cls['label scores'] = lk
+
+
+def explorer_clsfiers(clsses, Xd, Yd, indexes=None, n_folds=5, bv=False):
+    gen = zip(indexes, clsses) if indexes else enumerate(clsses)
+    cv = StratifiedKFold(Yd, n_folds=n_folds)
+    fmt = '%5d %-30s %6.4f %6.4f %6.4f %6.4f'
+    res = []
+    kw = dict(bbox_inches="tight", dpi=300)
+    for index, cls in gen:
+        try:
+            cls['scores'] = cross_val_score(cls['classifier'](**cls['kwargs']),
+                                            Xd, Yd, cv=cv, n_jobs=1, verbose=0)
+            # TODO: if n_jobs == -1 raise:
+            # AttributeError: '_MainProcess' object has no attribute '_daemonic'
+            mean, mx, mn, st = (cls['scores'].mean(), cls['scores'].max(),
+                                cls['scores'].min(), cls['scores'].std())
+            vals = (index, cls['name'], mean, mx, mn, st)
+            print(fmt % vals)
+            res.append(vals)
+            if bv:
+                bias_variance_analysis(cls, Xd, Yd, n_folds=5, step=5)
+                bv = cls['bias variance']
+                data_sizes = np.array(sorted(bv.keys()))
+                test = np.array([bv[i]['test'] for i in data_sizes])
+                train = np.array([bv[i]['train'] for i in data_sizes])
+                plot_bias_variance(data_sizes,
+                                   train.mean(axis=1), test.mean(axis=1),
+                                   cls['name'], "Bias-Variance for '%s'",
+                                   train_err_std=train.std(axis=1),
+                                   test_err_std=test.std(axis=1),
+                                   train_stl='-', test_stl='-',
+                                   train_width=1, test_width=1,
+                                   train_clr='b', test_clr='r', alpha=0.2,
+                                   fmt='png', **kw)
+                with open("%s.pkl" % cls['name'].replace(' ', '_'), 'wb') as pkl:
+                    pk.dump(cls, pkl)
+        except:
+            #import ipdb; ipdb.set_trace()
+            #print('problem with: %s' % cls['name'])
+            pass
+    return np.array(res, dtype=SCORES_DTYPE)
+
+
+def explorer_clsfiers_old(clsses, Xt, Yt, Xd, Yd, clss,
                       indexes=None, csv=sys.stdout):
     errors = []
     gen = zip(indexes, clsses) if indexes else enumerate(clsses)
@@ -165,37 +304,46 @@
         print('Error in: %s' % err['name'])
 
 
+CMAP = plt.cm.Blues
+
+
 def plot_grid(grid, save=''):
     C = grid.param_grid['C']
     gamma = grid.param_grid['gamma']
-
-    for kernel in grid.param_grid['kernel']:
+    kernels = grid.param_grid['kernel']
+    for kernel in kernels:
         scores = [x[1] for x in grid.grid_scores_ if x[0]['kernel'] == kernel]
         scores = np.array(scores).reshape(len(C), len(gamma))
         # draw heatmap of accuracy as a function of gamma and C
-        pl.figure(figsize=(8, 6))
-        pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
-        pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
-        pl.xlabel(r'$\gamma$')
-        pl.ylabel('C')
-        pl.colorbar()
-        pl.xticks(np.arange(len(gamma)), gamma, rotation=45)
-        pl.yticks(np.arange(len(C)), C)
+        #pl.figure(figsize=(8, 6))
+        #pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
+        fig, ax = plt.subplots()
+        img = ax.imshow(scores, interpolation='nearest', cmap=CMAP)
+        ax.set_xlabel(r'$\gamma$')
+        ax.set_ylabel('C')
+        ax.set_xticks(np.arange(len(gamma)))
+        ax.set_xticklabels(gamma, rotation=45)
+        ax.set_yticks(np.arange(len(C)))
+        ax.set_yticklabels(C)
+#        if kernel == 'poly':
+#            import ipdb; ipdb.set_trace()
         ic, igamma = np.unravel_index(np.argmax(scores), scores.shape)
-        pl.plot(igamma, ic, 'r.')
-        best = scores[igamma, ic]
-        titl = r"$best:\, %0.4f, \,C:\, %g, \,\gamma: \,%g$" % (best,
-                                                             C[ic],
-                                                             gamma[igamma])
-        pl.title(titl)
+        ax.plot(igamma, ic, 'r.')
+        best = scores[ic, igamma]
+        titl = r"%s $best:\, %0.4f, \,C:\, %g, \,\gamma: \,%g$" % (kernel.title(),
+                                                                   best, C[ic],
+                                                                   gamma[igamma])
+        ax.set_title(titl)
+        fig.colorbar(img)
         if save:
-            pl.savefig(save, dpi=600, trasparent=True, bbox_inches='tight')
-        pl.show()
+            fig.savefig(save % kernel, dpi=600, trasparent=True, bbox_inches='tight')
+        fig.show()
 
 
 def explore_SVC(Xt, Yt, n_folds=3, n_jobs=1, **kwargs):
     cv = StratifiedKFold(y=Yt, n_folds=n_folds)
-    grid = GridSearchCV(SVC(), param_grid=kwargs, cv=cv, n_jobs=n_jobs)
+    grid = GridSearchCV(SVC(), param_grid=kwargs, cv=cv, n_jobs=n_jobs,
+                        verbose=2)
     grid.fit(Xt, Yt)
     print("The best classifier is: ", grid.best_estimator_)
     return grid

Modified: grass-addons/grass7/vector/v.class.ml/sqlite2npy.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/sqlite2npy.py	2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/sqlite2npy.py	2014-05-25 19:07:24 UTC (rev 60476)
@@ -10,6 +10,7 @@
 from grass.pygrass.vector import VectorTopo
 
 FCATS = 'cats.npy'
+FCOLS = 'cols.npy'
 FDATA = 'data.npy'
 FINDX = 'indx.npy'
 FCLSS = 'training_classes.npy'
@@ -32,22 +33,23 @@
     return dt
 
 
-def save2npy(vect, l_data, l_trning,
-             fcats=FCATS, fdata=FDATA, findx=FINDX,
+def save2npy(vect, l_data, l_trn,
+             fcats=FCATS, fcols=FCOLS, fdata=FDATA, findx=FINDX,
              fclss=FCLSS, ftdata=FTDATA):
-    """Return 5 arrays:
+    """Return 6 arrays:
         - categories,
+        - columns name,
         - data,
         - a boolean array with the training,
-        - the training classes
-        - the training data
+        - the training classes,
+        - the training data.
     """
     with VectorTopo(vect, mode='r') as vct:
         # instantiate the tables
-        data = (vct.dblinks.by_layer(l_data).table() if l_data.isdigit()
+        data = (vct.dblinks.by_layer(int(l_data)).table() if l_data.isdigit()
                 else vct.dblinks.by_name(l_data).table())
-        trng = (vct.dblinks.by_layer(l_trning).table() if l_trning.isdigit()
-                else vct.dblinks.by_name(l_trning).table())
+        trng = (vct.dblinks.by_layer(int(l_trn)).table() if l_trn.isdigit()
+                else vct.dblinks.by_name(l_trn).table())
 
         # check the dimensions
         n_trng, n_data = trng.n_rows(), data.n_rows()
@@ -68,9 +70,10 @@
 
         # extract the data
         data_cols = data.columns.names()
+        cols = np.array(data_cols)
         data_cols.remove(data.key)
-        cols = ', '.join(data_cols)
-        slct_data = "SELECT {cols} FROM {tname};".format(cols=cols,
+        scols = ', '.join(data_cols)
+        slct_data = "SELECT {cols} FROM {tname};".format(cols=scols,
                                                          tname=data.name)
         shape = (n_data, len(data_cols))
         # use the function to be more memory efficient
@@ -89,11 +92,12 @@
 
         # save
         np.save(fcats, cats)
+        np.save(fcols, cols)
         np.save(fdata, dta)
         np.save(findx, trn_indxs)
         np.save(fclss, trn_ind)
         np.save(ftdata, trn_dta)
-        return cats, dta, trn_indxs, trn_ind, trn_dta
+        return cats, cols, dta, trn_indxs, trn_ind, trn_dta
 
 
 def load_from_npy(fcats=FCATS, fdata=FDATA, findx=FINDX,

Modified: grass-addons/grass7/vector/v.class.ml/training_extraction.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/training_extraction.py	2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/training_extraction.py	2014-05-25 19:07:24 UTC (rev 60476)
@@ -13,9 +13,9 @@
 from grass.script.core import overwrite
 from grass.pygrass.vector import VectorTopo, Vector
 from grass.pygrass.vector.table import Link, Table
-from grass.pygrass.vector.geometry import Area, intersects
+from grass.pygrass.vector.geometry import Line, Area, intersects
 from grass.pygrass.vector.basic import Bbox, BoxList
-from grass.pygrass.messages import Messenger
+from grass.pygrass.messages import get_msgr
 
 
 COLS = [('cat', 'INTEGER PRIMARY KEY'),
@@ -29,9 +29,10 @@
     """
     to_up = []
     bbox = Bbox()
+    aline = Line()
     for area in alist:
         bbox = area.bbox(bbox)
-        if ((intersects(area.boundary, line)) or
+        if ((intersects(area.get_points(aline), line)) or
                 (area.contain_pnt(line[0], bbox))):
             to_up.append((line.cat, area.cat))
     if (cur is not None) and (sql is not None):
@@ -45,20 +46,25 @@
     """
     to_up = []
     bbox = trn_area.bbox()
+    aline = Line()
+    tline = Line()
     for s_id in ids:
         seg_area.id = s_id
         seg_area.read()
-        if ((intersects(seg_area.boundary, trn_area.boundary)) or
-                (trn_area.contain_pnt(seg_area.boundary[0], bbox)) or
-                (seg_area.contain_pnt(trn_area.boundary[0]))):
+        seg_area.get_points(aline)
+        trn_area.get_points(tline)
+        if ((intersects(aline, tline)) or
+                (trn_area.contain_pnt(aline[0], bbox)) or
+                (seg_area.contain_pnt(tline[0]))):
             to_up.append((trn_area.cat, seg_area.cat))
     if (cur is not None) and (sql is not None):
         cur.executemany(sql, to_up)
     return to_up
 
 
-def find_lines(table, trn, seg, msgr):
+def find_lines(table, trn, seg):
     """Update the lines' table using the boundaries of the training areas"""
+    msgr = get_msgr()
     sql = UPDATE.format(tname=table.name, cat=table.key)
     boxlist = BoxList()
     n_bounds = len(trn)
@@ -70,24 +76,29 @@
     table.conn.commit()
 
 
-def find_area(table, trn_ids, trn_area, seg_area, n_areas, seg, msgr):
+def find_area(table, trn_ids, trn_area, seg_area, n_areas, seg):
     """Update the lines' table using the training areas"""
+    msgr = get_msgr()
     cur = table.conn.cursor()
     msgr.message(_("Finding areas..."))
     sql = UPDATE.format(tname=table.name, cat=table.key)
     boxlist = BoxList()
+    res = []
     for i, trn_id in enumerate(trn_ids):
         msgr.percent(i, n_areas, 1)
         trn_area.id = trn_id
         trn_area.read()
         bblist = seg.find['by_box'].areas(trn_area.boundary.bbox(), boxlist,
                                           bboxlist_only=True)
-        update_areas(trn_area, seg_area, bblist.ids, cur, sql)
+        res.append(np.array(update_areas(trn_area, seg_area, bblist.ids,
+                                         cur, sql)))
     table.conn.commit()
 
 
-def make_new_table(vct, msgr, tname, cols=COLS, force=overwrite()):
+def make_new_table(vct, tname, cols=COLS, force=None):
     """Check/remove/create a new table"""
+    msgr = get_msgr()
+    force = overwrite() if force is None else force
     create_link = True
     # make a new table
     table = Table(tname, vct.table.conn)
@@ -106,9 +117,10 @@
     return table, create_link
 
 
-def check_balance(table, trntab, msgr):
+def check_balance(table, trntab):
     """Checking the balance between different training classes."""
     msg = _('Checking the balance between different training classes.')
+    msgr = get_msgr()
     msgr.message(msg)
     chk_balance = ("SELECT class, count(*) as num_of_segments "
                    "FROM {tname} "
@@ -144,20 +156,23 @@
 def extract_training(vect, tvect, tlayer):
     """Assign a class to all the areas that contained, are contained
     or intersect a training vector"""
-    msgr = Messenger()
-    with VectorTopo(tvect, mode='r') as trn:
-        with VectorTopo(vect, mode='r') as vct:
+    msgr = get_msgr()
+    tname, tmset = tvect.split('@') if '@' in tvect else (tvect, '')
+    vname, vmset = vect.split('@') if '@' in vect else (vect, '')
+    with VectorTopo(tname, tmset, mode='r') as trn:
+        with VectorTopo(vname, vmset, mode='r') as vct:
             layer_num, layer_name = get_layer_num_name(vct, tlayer)
             # instantiate the area objects
             trn_area = Area(c_mapinfo=trn.c_mapinfo)
             seg_area = Area(c_mapinfo=vct.c_mapinfo)
             n_areas = trn.number_of('areas')
             # check/remove/create a new table
-            table, create_link = make_new_table(vct, msgr, layer_name)
+            table, create_link = make_new_table(vct, layer_name)
+            find_lines(table, [l for l in trn.viter('lines')], vct)
             # find and save all the segments
             find_area(table, trn.viter('areas', idonly=True),
-                      trn_area, seg_area, n_areas, vct, msgr)
-            check_balance(table, trn.table, msgr)
+                      trn_area, seg_area, n_areas, vct)
+            check_balance(table, trn.table)
 
     if create_link:
         msgr.message(_("Connect the new table to the vector map..."))

Modified: grass-addons/grass7/vector/v.class.ml/v.class.ml.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/v.class.ml.py	2014-05-25 13:12:57 UTC (rev 60475)
+++ grass-addons/grass7/vector/v.class.ml/v.class.ml.py	2014-05-25 19:07:24 UTC (rev 60476)
@@ -69,6 +69,22 @@
 #%  required: no
 #%end
 #%option
+#%  key: npy_cols
+#%  type: string
+#%  multiple: no
+#%  description: Numpy array with column names.
+#%  answer: cols.npy
+#%  required: no
+#%end
+#%option
+#%  key: npy_index
+#%  type: string
+#%  multiple: no
+#%  description: Boolean numpy array with training indexes.
+#%  answer: indx.npy
+#%  required: no
+#%end
+#%option
 #%  key: npy_tdata
 #%  type: string
 #%  multiple: no
@@ -101,6 +117,22 @@
 #%  required: no
 #%end
 #%option
+#%  key: imp_csv
+#%  type: string
+#%  multiple: no
+#%  description: Feature importances with forests of trees: CSV
+#%  answer: features_importances.csv
+#%  required: no
+#%end
+#%option
+#%  key: imp_fig
+#%  type: string
+#%  multiple: no
+#%  description: Feature importances with forests of trees: figure
+#%  answer: features_importances.png
+#%  required: no
+#%end
+#%option
 #%  key: scalar
 #%  type: string
 #%  multiple: yes
@@ -145,24 +177,51 @@
 #%end
 #%option
 #%  key: nan
-#%  type: double
-#%  multiple: no
-#%  description: Value to use to substitute NaN
+#%  type: string
+#%  multiple: yes
+#%  description: Column pattern:Value or Numpy function to use to substitute NaN values
 #%  required: no
+#%  answer: *_skewness:nanmean,*_kurtosis:nanmean
 #%end
 #%option
 #%  key: inf
+#%  type: string
+#%  multiple: yes
+#%  description: Column pattern:Value or Numpy function to use to substitute Inf values
+#%  required: no
+#%  answer: *_skewness:nanmean,*_kurtosis:nanmean
+#%end
+#%option
+#%  key: neginf
+#%  type: string
+#%  multiple: yes
+#%  description: Column pattern:Value or Numpy function to use to substitute negative Inf values
+#%  required: no
+#%  answer:
+#%end
+#%option
+#%  key: posinf
 #%  type: double
+#%  multiple: yes
+#%  description: Column pattern:Value or Numpy function to use to substitute positive Inf values
+#%  required: no
+#%  answer:
+#%end
+#%option
+#%  key: csv_test_cls
+#%  type: string
 #%  multiple: no
-#%  description: Value to use to substitute NaN
+#%  description: csv file name with results of different machine learning scores
 #%  required: no
+#%  answer: test_classifiers.csv
 #%end
 #%option
-#%  key: csv
+#%  key: report_class
 #%  type: string
 #%  multiple: no
-#%  description: csv file name with tha accuracy of different machine learning
+#%  description: file name for the classification report of the tested classifiers
 #%  required: no
+#%  answer: classification_report.txt
 #%end
 #%option
 #%  key: svc_c_range
@@ -244,6 +303,10 @@
 #%  description: Export to numpy files
 #%end
 #%flag
+#%  key: f
+#%  description: Feature importances with forests of trees
+#%end
+#%flag
 #%  key: b
 #%  description: Balance the training using the class with the minor number of areas
 #%end
@@ -264,36 +327,28 @@
 #%  description: Test different classification methods
 #%end
 #%flag
+#%  key: v
+#%  description: Bias variance
+#%end
+#%flag
 #%  key: d
 #%  description: Explore the SVC domain
 #%end
 #-----------------------------------------------------
-"""
-v.category input=seg005_64 at pietro layer=1,2,3,4,5,6,7,8,9 type=point,line,centroid,area,face output=seg005_64_new option=transfer
-
-v.category input=seg005_64_new option=report
-
-i.pca -n input=Combabula_Nearmap.red at PERMANENT,Combabula_Nearmap.green at PERMANENT,Combabula_Nearmap.blue at PERMANENT output_prefix=pca
-PC1      2.78 ( 0.5757, 0.5957, 0.5601) [92.83%]
-PC2      0.20 ( 0.6002, 0.1572,-0.7842) [ 6.81%]
-PC3      0.01 ( 0.5552,-0.7877, 0.2670) [ 0.36%]
-
-time r.texture -a input=pca.1 at pietro prefix=pca5_ size=5 --o
-time r.texture -a input=pca.1 at pietro prefix=pca3_ size=3 --o
-echo finish
-"""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 import imp
 import sys
 import os
+from pprint import pprint
+from fnmatch import fnmatch
 
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import SVC
 
 from grass.pygrass.functions import get_lib_path
-from grass.pygrass.messages import Messenger
+from grass.pygrass.messages import get_msgr
 from grass.pygrass.vector import Vector
 from grass.pygrass.modules import Module
 from grass.script.core import parser, overwrite
@@ -311,8 +366,19 @@
                           optimize_training, explore_SVC, plot_grid)
 from sqlite2npy import save2npy
 from npy2table import export_results
+from features import importances, tocsv
 
 
+RULES = {'*_skewness': np.nanmean,
+         '*_coeff_var': np.nanmean,
+         '*_stddev': np.nanmean,
+         '*_variance': np.nanmean,
+         '*_mean': np.nanmean,
+         '*_range': np.nanmean,
+         '*_max': np.nanmax,
+         '*_min': np.nanmin, }
+
+
 def get_indexes(string, sep=',', rangesep='-'):
     """
     >>> indx = '1-5,34-36,40'
@@ -333,8 +399,74 @@
         cur = vct.table.execute('SELECT cat, color FROM %s;' % vct.name)
         return dict([c for c in cur.fetchall()])
 
+
+def convert(string):
+    try:
+        return float(string)
+    except:
+        try:
+            return getattr(np, string)
+        except AttributeError:
+            msg = "Not a valid option, is not a number or a numpy function."
+            raise TypeError(msg)
+
+
+def get_rules(string):
+    res = {}
+    pairs = [s.strip().split(':') for s in string.strip().split(',')]
+    for key, val in pairs:
+        res[key] = convert(val)
+    return res
+
+
+def find_special_cols(array, cols, report=True,
+                      special=('nan', 'inf', 'neginf', 'posinf')):
+    sp = {key: [] for key in special}
+    cntr = {key: [] for key in special}
+    for i in range(len(cols)):
+        for key in special:
+            barray = getattr(np, 'is%s' % key)(array[:, i])
+            if barray.any():
+                sp[key].append(i)
+                cntr[key].append(barray.sum())
+    if report:
+        indent = '    '
+        tot = len(array)
+        for key in special:
+            fmt = '- %15s (%3d/%d, %4.3f%%)'
+            strs = [fmt % (col, cnt, tot, cnt/float(tot)*100)
+                    for col, cnt in zip(cols[np.array(sp[key])], cntr[key])]
+            print('%s:\n%s' % (key, indent), ('\n%s' % indent).join(strs),
+                  sep='')
+    return sp
+
+
+def substitute(X, rules, cols):
+    vals = {}
+    special_cols = find_special_cols(X, cols)
+    pprint(special_cols)
+    for key in rules.keys():
+        vals[key] = {}
+        for i in special_cols[key]:
+            for rule in rules[key]:
+                if fnmatch(cols[i], rule):
+                    indx = getattr(np, 'is%s' % key)(X[:, i])
+                    val = (rules[key][rule] if np.isscalar(rules[key][rule])
+                           else rules[key][rule](X[:, i][~indx]))
+                    X[:, i][indx] = val
+                    vals[key][cols[i]] = val
+    return X, vals
+
+
+def extract_classes(vect, layer):
+    vect, mset = vect.split('@') if '@'in vect else (vect, '')
+    with Vector(vect, mapset=mset, layer=layer, mode='r') as vct:
+        vct.table.filters.select('cat', 'class')
+        return {key: val for key, val in vct.table.execute()}
+
+
 def main(opt, flg):
-    msgr = Messenger()
+    msgr = get_msgr()
     indexes = None
     vect = opt['vector']
     vtraining = opt['vtraining'] if opt['vtraining'] else None
@@ -343,6 +475,9 @@
     tlayer = opt['tlayer'] if opt['tlayer'] else vect + '_training'
     rlayer = opt['rlayer'] if opt['rlayer'] else vect + '_results'
 
+    labels = extract_classes(vtraining, vlayer)
+    pprint(labels)
+
     if opt['scalar']:
         scapar = opt['scalar'].split(',')
         scaler = StandardScaler(with_mean='with_mean' in scapar,
@@ -355,7 +490,10 @@
 
     if flg['n']:
         msgr.message("Save arrays to npy files.")
-        save2npy(vect, vlayer, tlayer)
+        save2npy(vect, vlayer, tlayer,
+                 fcats=opt['npy_cats'], fcols=opt['npy_cols'],
+                 fdata=opt['npy_data'], findx=opt['npy_index'],
+                 fclss=opt['npy_tclasses'], ftdata=opt['npy_tdata'])
 
     # define the classifiers to use/test
     if opt['pyclassifiers'] and opt['pyvar']:
@@ -378,29 +516,39 @@
         indexes = [i for i in get_indexes(opt['pyindx'])]
         classifiers = [classifiers[i] for i in indexes]
 
-    csv = open(opt['csv'], 'w') if opt['csv'] else sys.stdout
     num = int(opt['n_training']) if opt['n_training'] else None
 
     # load fron npy files
     Xt = np.load(opt['npy_tdata'])
     Yt = np.load(opt['npy_tclasses'])
-    clsses = sorted(set(Yt))
+    cols = np.load(opt['npy_cols'])
 
-    # Substitute NaN
-    if opt['nan']:
-        msgr.message("Substitute NaN values with: <%g>" % float(opt['nan']))
-        Xt[np.isnan(Xt)] = float(opt['nan'])
-    if opt['inf']:
-        msgr.message("Substitute Inf values with: <%g>" % float(opt['inf']))
-        Xt[np.isinf(Xt)] = float(opt['inf'])
+    # Define rules to substitute NaN, Inf, posInf, negInf values
+    rules = {}
+    for key in ('nan', 'inf', 'neginf', 'posinf'):
+        if opt[key]:
+            rules[key] = get_rules(opt[key])
+    pprint(rules)
 
+    # Substitute (skip cat column)
+    Xt, rules_vals = substitute(Xt, rules, cols[1:])
+
+    # Feature importances with forests of trees
+    if flg['f']:
+        importances(Xt, Yt, cols[1:],
+                    csv=opt['imp_csv'], img=opt['imp_fig'],
+                    # default parameters to save the matplotlib figure
+                    **dict(dpi=300, transparent=False, bbox_inches='tight'))
+
     # optimize the training set
     if flg['o']:
         ind_optimize = (int(opt['pyindx_optimize']) if opt['pyindx_optimize']
                         else 0)
         cls = classifiers[ind_optimize]
         msgr.message("Find the optimum training set.")
-        best, Xbt, Ybt = optimize_training(cls, Xt, Yt, scaler,
+        best, Xbt, Ybt = optimize_training(cls, Xt, Yt,
+                                           labels, #{v: k for k, v in labels.items()},
+                                           scaler,
                                            num=num, maxiterations=1000)
         msg = "    - save the optimum training data set to: %s."
         msgr.message(msg % opt['npy_btdata'])
@@ -435,33 +583,40 @@
         msgr.message("Exploring the SVC domain.")
         grid = explore_SVC(Xbt, Ybt, n_folds=3, n_jobs=int(opt['svc_n_jobs']),
                            C=C_range, gamma=gamma_range, kernel=kernel_range)
+        import pickle
+        pkl = open('grid.pkl', 'w')
+        pickle.dump(grid, pkl)
+        pkl.close()
         plot_grid(grid, save=opt['svc_img'])
 
     # test the accuracy of different classifiers
     if flg['t']:
         # test different classifiers
         msgr.message("Exploring different classifiers.")
-        explorer_clsfiers(classifiers, Xbt, Ybt, Xt, Yt, clsses, indexes, csv)
+        msgr.message("cls_id   cls_name          mean     max     min     std")
+        #import ipdb; ipdb.set_trace()
+        res = explorer_clsfiers(classifiers, Xt, Yt,
+                                indexes=indexes, n_folds=5, bv=flg['v'])
+        # TODO: sort(order=...) is working only in the terminal, why?
+        #res.sort(order='mean')
+        with open(opt['csv_test_cls'], 'w') as csv:
+            csv.write(tocsv(res))
 
     if flg['c']:
         # classify
         cols = []
         data = np.load(opt['npy_data'])
-        if opt['nan']:
-            msg = "Substitute NaN values with: <%g>" % float(opt['nan'])
-            msgr.message(msg)
-            data[np.isnan(data)] = float(opt['nan'])
-        if opt['inf']:
-            msg = "Substitute Inf values with: <%g>" % float(opt['inf'])
-            msgr.message(msg)
-            data[np.isinf(data)] = float(opt['inf'])
+        pprint(rules_vals)
+        # Substitute (skip cat column)
+        data = substitute(data, rules_vals, cols[1:])
 
         msgr.message("Scaling the whole data set.")
         data = scaler.transform(data) if scaler else data
         cats = np.load(opt['npy_cats'])
 
         for cls in classifiers:
-            run_classifier(cls, Xbt, Ybt, Xt, Yt, clsses, data, save=csv)
+            run_classifier(cls, Xbt, Ybt, Xt, Yt, labels, data,
+                           save=opt['report_class'])
             cols.append((cls['name'], 'INTEGER'))
 
 #        import pickle
@@ -494,6 +649,5 @@
                 rclrs(map=rst, rules='-', stdin_=rules)
 
 
-
 if __name__ == "__main__":
     main(*parser())



More information about the grass-commit mailing list