[GRASS-SVN] r61110 - grass-addons/grass7/vector/v.class.ml

svn_grass at osgeo.org svn_grass at osgeo.org
Wed Jul 2 00:36:18 PDT 2014


Author: zarch
Date: 2014-07-02 00:36:18 -0700 (Wed, 02 Jul 2014)
New Revision: 61110

Modified:
   grass-addons/grass7/vector/v.class.ml/ml_functions.py
   grass-addons/grass7/vector/v.class.ml/v.class.ml.py
Log:
v.class.ml: Add decomposition methods
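
The new decomposition option takes the method name and, optionally, its
parameters, separated by '|'. A minimal sketch of how such a string is
meant to be parsed (the int() cast is only illustrative; the module code
below passes the parameter values through as strings):

    from sklearn.decomposition import PCA

    option = 'PCA|n_components=10'
    name, _, params = option.partition('|')
    kwargs = {}
    for pair in params.split(',') if params else []:
        key, val = pair.split('=')
        kwargs[key] = int(val)  # illustrative cast; not done in the module
    dec = PCA(**kwargs)         # the module resolves name via the DECMP dict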

Modified: grass-addons/grass7/vector/v.class.ml/ml_functions.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_functions.py	2014-07-02 03:54:30 UTC (rev 61109)
+++ grass-addons/grass7/vector/v.class.ml/ml_functions.py	2014-07-02 07:36:18 UTC (rev 61110)
@@ -16,7 +16,8 @@
 
 
 from sklearn import metrics as metrics
-from sklearn.metrics import precision_recall_curve as prc, roc_curve, auc
+from sklearn.metrics import (precision_recall_curve as prc, roc_curve, auc,
+                             confusion_matrix)
 from sklearn.cross_validation import StratifiedKFold
 from sklearn.grid_search import GridSearchCV
 from sklearn.svm import SVC
@@ -24,7 +25,9 @@
 
 #from grass.pygrass.messages import get_msgr
 
+CMAP = plt.cm.Blues
 
+
 COLS = [('cat', 'INTEGER PRIMARY KEY'),
         ('class', 'INTEGER'),
         ('color', 'VARCHAR(11)'), ]
@@ -191,19 +194,63 @@
             linestyle=train_stl, linewidth=train_width)
     ax.legend(loc="upper right")
     ax.grid(True, linestyle='-', color='0.75')
-    fig.savefig("bv_%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+    fig.savefig("bv__%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
 
 
+def plot_confusion_matrix(cm, labels, name, fmt='png', **kwargs):
+    # cm is the list of per-fold confusion matrices: sum over the folds,
+    # then normalize each row so that every true class sums to one
+    conf = np.array(cm).sum(axis=0)
+    conf = conf.astype(float) / conf.sum(axis=1)[:, np.newaxis]
+    fig, ax = plt.subplots(figsize=(6, 5))
+    img = ax.imshow(conf, cmap=CMAP)
+    fig.colorbar(img)
+    ticks = range(len(labels))
+    ax.set_xticks(ticks)
+    ax.set_xticklabels(labels)
+    ax.xaxis.set_ticks_position("bottom")
+    ax.set_yticks(ticks)
+    ax.set_yticklabels(labels)
+    ax.set_title("Confusion matrix: %s" % name)
+    ax.grid(False)
+    ax.set_xlabel('Predicted class')
+    ax.set_ylabel('True class')
+    fig.savefig("confusion_matrix__%s.%s" % (name.replace(" ", "_"), fmt),
+                **kwargs)
+
+
+def plot_pr(precision, recall, pr_score, name, label=None, fmt='png',
+            **kwargs):
+    fig, ax = plt.subplots(figsize=(6, 5))
+    ax.grid()
+    ax.fill_between(recall, precision, alpha=0.5)
+    ax.plot(recall, precision, lw=1)
+    ax.set_xlim([0.0, 1.0])
+    ax.set_ylim([0.0, 1.0])
+    ax.set_xlabel('Recall')
+    ax.set_ylabel('Precision')
+    ax.set_title('P/R curve (AUC = %0.2f) / %s vs rest' % (pr_score, label))
+    fig.savefig("pr__%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+
+
+def plot_ROC(fpr, tpr, auc_score, name, label, fmt='png', **kwargs):
+    fig, ax = plt.subplots(figsize=(6, 5))
+    ax.grid()
+    ax.plot([0, 1], [0, 1], 'k--')
+    ax.plot(fpr, tpr)
+    ax.fill_between(fpr, tpr, alpha=0.5)
+    ax.set_xlim([0.0, 1.0])
+    ax.set_ylim([0.0, 1.0])
+    ax.set_xlabel('False Positive Rate')
+    ax.set_ylabel('True Positive Rate')
+    ax.set_title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label),
+                 verticalalignment="bottom")
+    fig.savefig("roc__%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+
+
 def bias_variance_analysis(cls, tdata, tclss, n_folds=5, step=5):
     clss = sorted(set(tclss))
     num = min([len(tclss[tclss == c]) for c in clss])
-
     clf = cls['classifier'](**cls['kwargs'])
-    keys = ('fprs', 'tprs', 'roc_scores', 'pr_scores', 'precisions',
-            'recalls', 'thresholds')
-
     bv = {}
-    lk = {l: {k: [] for k in keys} for l in clss}
     for n in range(5, num, step):
         X, y = balance(tdata, tclss, n)
         cv = StratifiedKFold(y, n_folds=n_folds)
@@ -225,29 +272,79 @@
             train_errors.append(1 - train_score)
             test_errors.append(1 - test_score)
 
-            # get probability
-            proba = clf.predict_proba(X_test)
-
-            # compute score for each class VS rest
-            for idx, label in enumerate(clss):
-                fpr, tpr, roc_thr = roc_curve(y_test, proba[:, idx], label)
-                precision, recall, pr_thr = prc(y_test, proba[:, idx], label)
-                lk[label]['fprs'].append(fpr)
-                lk[label]['tprs'].append(tpr)
-                lk[label]['roc_scores'].append(auc(fpr, tpr))
-
-                lk[label]['precisions'].append(precision)
-                lk[label]['recalls'].append(recall)
-                lk[label]['thresholds'].append(pr_thr)
-                lk[label]['pr_scores'].append(auc(recall, precision))
         bv[n] = {'test': np.array(test_errors),
                  'train': np.array(train_errors),
                  'score': np.array(scores)}
     cls['bias variance'] = bv
+
+
+def extra_analysis(cls, tdata, tclss, labels, n_folds=10):
+    clss = sorted(labels.keys())
+    lbs = [labels[cl] for cl in clss]
+    cv = StratifiedKFold(tclss, n_folds=n_folds)
+    keys = ('fprs', 'tprs', 'roc_scores', 'pr_scores', 'precisions',
+            'recalls', 'thresholds')
+    train_errors, test_errors, scores, cms = [], [], [], []
+    lk = {l: {k: [] for k in keys} for l in clss}
+    clf = cls['classifier'](**cls['kwargs'])
+    for train, test in cv:
+        X_train, y_train = tdata[train], tclss[train]
+        X_test, y_test = tdata[test], tclss[test]
+        # fit train data
+        clf.fit(X_train, y_train)
+
+        train_score = clf.score(X_train, y_train)
+        test_score = clf.score(X_test, y_test)
+        scores.append(test_score)
+
+        train_errors.append(1 - train_score)
+        test_errors.append(1 - test_score)
+
+        y_pred = clf.predict(X_test)
+        cms.append(confusion_matrix(y_test, y_pred, lbs))
+        # get probability
+        proba = clf.predict_proba(X_test)
+        # compute score for each class VS rest
+        for idx, label in enumerate(clss):
+            fpr, tpr, roc_thr = roc_curve(y_test, proba[:, idx], label)
+            precision, recall, pr_thr = prc(y_test, proba[:, idx], label)
+            lk[label]['fprs'].append(fpr)
+            lk[label]['tprs'].append(tpr)
+            lk[label]['roc_scores'].append(auc(fpr, tpr))
+
+            lk[label]['precisions'].append(precision)
+            lk[label]['recalls'].append(recall)
+            lk[label]['thresholds'].append(pr_thr)
+            lk[label]['pr_scores'].append(auc(recall, precision))
     cls['label scores'] = lk
+    cls['train errors'] = np.array(train_errors)
+    cls['test errors'] = np.array(test_errors)
+    cls['confusion matrix'] = cms
 
 
-def explorer_clsfiers(clsses, Xd, Yd, indexes=None, n_folds=5, bv=False):
+def plot_extra(cls, labels, fmt='png', **kwargs):
+    clss = sorted(labels.keys())
+    lk = cls['label scores']
+    for cl in clss:
+        scores_to_sort = lk[cl]['roc_scores']
+        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
+        name = "%s %s" % (cls['name'], labels[cl])
+        plot_pr(lk[cl]['precisions'][median],
+                lk[cl]['recalls'][median],
+                lk[cl]['pr_scores'][median],
+                name=name, label=labels[cl], fmt=fmt, **kwargs)
+        plot_ROC(lk[cl]['fprs'][median],
+                 lk[cl]['tprs'][median],
+                 lk[cl]['roc_scores'][median],
+                 name=name, label=labels[cl], fmt=fmt, **kwargs)
+    plot_confusion_matrix(cls['confusion matrix'],
+                          labels=[labels[cl] for cl in clss],
+                          name=cls['name'], fmt=fmt, **kwargs)
+
+
+def explorer_clsfiers(clsses, Xd, Yd, labels, indexes=None, n_folds=5,
+                      bv=False, extra=False):
     gen = zip(indexes, clsses) if indexes else enumerate(clsses)
     cv = StratifiedKFold(Yd, n_folds=n_folds)
     fmt = '%5d %-30s %6.4f %6.4f %6.4f %6.4f'
@@ -279,8 +376,12 @@
                                    train_width=1, test_width=1,
                                    train_clr='b', test_clr='r', alpha=0.2,
                                    fmt='png', **kw)
-                with open("%s.pkl" % cls['name'].replace(' ', '_'), 'wb') as pkl:
-                    pk.dump(cls, pkl)
+            if extra:
+                extra_analysis(cls, Xd, Yd, labels)
+                plot_extra(cls, labels, **kw)
+            with open("%s.pkl" % cls['name'].replace(' ', '_'), 'wb') as pkl:
+                pk.dump(cls, pkl)
         except:
             #import ipdb; ipdb.set_trace()
             #print('problem with: %s' % cls['name'])
@@ -304,9 +405,6 @@
         print('Error in: %s' % err['name'])
 
 
-CMAP = plt.cm.Blues
-
-
 def plot_grid(grid, save=''):
     C = grid.param_grid['C']
     gamma = grid.param_grid['gamma']

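plot_extra picks, for each class, the cross-validation fold whose ROC score
is the median of all folds and plots that fold's P/R and ROC curves. A
self-contained sketch of the argsort-based median selection used there:

    import numpy as np

    scores = [0.91, 0.85, 0.97, 0.88, 0.93]
    median = np.argsort(scores)[len(scores) // 2]  # index of the median score
    assert scores[median] == 0.91
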
Modified: grass-addons/grass7/vector/v.class.ml/v.class.ml.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/v.class.ml.py	2014-07-02 03:54:30 UTC (rev 61109)
+++ grass-addons/grass7/vector/v.class.ml/v.class.ml.py	2014-07-02 07:36:18 UTC (rev 61110)
@@ -35,7 +35,7 @@
 #%  key: vlayer
 #%  type: string
 #%  multiple: no
-#%  description: layer name or number to use for the machine learning
+#%  description: layer name or number to use for the data
 #%  required: no
 #%end
 #%option
@@ -120,7 +120,7 @@
 #%  key: imp_csv
 #%  type: string
 #%  multiple: no
-#%  description: Feature importances with forests of trees: CSV
+#%  description: CSV file name for the feature importances ranked using the extra trees algorithm
 #%  answer: features_importances.csv
 #%  required: no
 #%end
@@ -128,7 +128,7 @@
 #%  key: imp_fig
 #%  type: string
 #%  multiple: no
-#%  description: Feature importances with forests of trees: figure
+#%  description: Figure file name for the feature importances ranked using the extra trees algorithm
 #%  answer: features_importances.png
 #%  required: no
 #%end
@@ -141,10 +141,18 @@
 #%  answer: with_mean,with_std
 #%end
 #%option
+#%  key: decomposition
+#%  type: string
+#%  multiple: no
+#%  description: choose a decomposition method (PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, FastICA, TruncatedSVD) and set its parameters, using '|' to separate the method from the parameters, e.g. PCA|n_components=98
+#%  required: no
+#%  answer: 
+#%end
+#%option
 #%  key: n_training
 #%  type: integer
 #%  multiple: no
-#%  description: Number of random training to training the machine learning
+#%  description: Number of random training samples per class used to train the machine learning algorithms
 #%  required: no
 #%end
 #%option
@@ -165,7 +173,7 @@
 #%  key: pyindx
 #%  type: string
 #%  multiple: no
-#%  description: specify the index of the classifiers that you want to use
+#%  description: specify the index or range of indexes of the classifiers that you want to use
 #%  required: no
 #%end
 #%option
@@ -187,7 +195,7 @@
 #%  key: inf
 #%  type: string
 #%  multiple: yes
-#%  description: Key:Value or Numpy funtion to use to substitute NaN values
+#%  description: Key:Value or Numpy function to use to substitute Inf values
 #%  required: no
 #%  answer: *_skewness:nanmean,*_kurtosis:nanmean
 #%end
@@ -195,7 +203,7 @@
 #%  key: neginf
 #%  type: string
 #%  multiple: yes
-#%  description: Key:Value or Numpy funtion to use to substitute NaN values
+#%  description: Key:Value or Numpy function to use to substitute neginf values
 #%  required: no
 #%  answer:
 #%end
@@ -203,7 +211,7 @@
 #%  key: posinf
 #%  type: double
 #%  multiple: yes
-#%  description: Key:Value or Numpy funtion to use to substitute NaN values
+#%  description: Key:Value or Numpy function to use to substitute posinf values
 #%  required: no
 #%  answer:
 #%end
@@ -219,7 +227,7 @@
 #%  key: report_class
 #%  type: string
 #%  multiple: no
-#%  description: csv file name with results of different machine learning scores
+#%  description: text file name for the report of the different machine learning algorithms
 #%  required: no
 #%  answer: classification_report.txt
 #%end
@@ -227,7 +235,7 @@
 #%  key: svc_c_range
 #%  type: double
 #%  multiple: yes
-#%  description: C value list
+#%  description: list of C values used to explore the SVC domain
 #%  required: no
 #%  answer: 1e-2,1e-1,1e0,1e1,1e2,1e3,1e4,1e5,1e6,1e7,1e8
 #%end
@@ -235,7 +243,7 @@
 #%  key: svc_gamma_range
 #%  type: double
 #%  multiple: yes
-#%  description: gamma value list
+#%  description: list of gamma values used to explore the SVC domain
 #%  required: no
 #%  answer: 1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4
 #%end
@@ -243,7 +251,7 @@
 #%  key: svc_kernel_range
 #%  type: string
 #%  multiple: yes
-#%  description: kernel value list
+#%  description: list of kernel values used to explore the SVC domain
 #%  required: no
 #%  answer: linear,poly,rbf,sigmoid
 #%end
@@ -251,7 +259,7 @@
 #%  key: svc_n_jobs
 #%  type: integer
 #%  multiple: no
-#%  description: number of jobs
+#%  description: number of jobs to use during the domain exploration
 #%  required: no
 #%  answer: 1
 #%end
@@ -259,21 +267,21 @@
 #%  key: svc_c
 #%  type: double
 #%  multiple: no
-#%  description: C value
+#%  description: final C value used for the classification
 #%  required: no
 #%end
 #%option
 #%  key: svc_gamma
 #%  type: double
 #%  multiple: no
-#%  description: gamma value
+#%  description: final gamma value used for the classification
 #%  required: no
 #%end
 #%option
 #%  key: svc_kernel
 #%  type: string
 #%  multiple: no
-#%  description: Available kernel are: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
+#%  description: final kernel value. Available kernels are: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
 #%  required: no
 #%  answer: rbf
 #%end
@@ -281,7 +289,7 @@
 #%  key: svc_img
 #%  type: string
 #%  multiple: no
-#%  description: filename with the image od SVC parameter
+#%  description: filename pattern for the image of the SVC parameters
 #%  required: no
 #%  answer: domain_%s.svg
 #%end
@@ -289,7 +297,7 @@
 #%  key: rst_names
 #%  type: string
 #%  multiple: no
-#%  description: filename with the image od SVC parameter
+#%  description: filename pattern for the output raster maps
 #%  required: no
 #%  answer: %s
 #%end
@@ -304,15 +312,15 @@
 #%end
 #%flag
 #%  key: f
-#%  description: Feature importances with forests of trees
+#%  description: Feature importances using the extra trees algorithm
 #%end
 #%flag
 #%  key: b
-#%  description: Balance the training using the class with the minor number of areas
+#%  description: Balance the training using the class with the smallest number of samples
 #%end
 #%flag
 #%  key: o
-#%  description: optimize the training samples
+#%  description: Optimize the training samples
 #%end
 #%flag
 #%  key: c
@@ -320,7 +328,7 @@
 #%end
 #%flag
 #%  key: r
-#%  description: Export the classify resutls to raster maps
+#%  description: Export the classification results to raster maps
 #%end
 #%flag
 #%  key: t
@@ -328,9 +336,13 @@
 #%end
 #%flag
 #%  key: v
-#%  description: Bias variance
+#%  description: Compute the bias-variance analysis during the test
 #%end
 #%flag
+#%  key: x
+#%  description: Compute extra metrics during the test: confusion matrix, ROC and PR curves
+#%end
+#%flag
 #%  key: d
 #%  description: Explore the SVC domain
 #%end
@@ -368,7 +380,11 @@
 from npy2table import export_results
 from features import importances, tocsv
 
+from sklearn.decomposition import (PCA, KernelPCA, ProbabilisticPCA,
+                                   RandomizedPCA, FastICA, TruncatedSVD)
+from sklearn.lda import LDA
 
+
 RULES = {'*_skewness': np.nanmean,
          '*_coeff_var': np.nanmean,
          '*_stddev': np.nanmean,
@@ -379,6 +395,15 @@
          '*_min': np.nanmin, }
 
 
+DECMP = {'PCA': PCA,
+         'KernelPCA': KernelPCA,
+         'ProbabilisticPCA': ProbabilisticPCA,
+         'RandomizedPCA': RandomizedPCA,
+         'FastICA': FastICA,
+         'TruncatedSVD': TruncatedSVD,
+         'LDA': LDA}
+
+
 def get_indexes(string, sep=',', rangesep='-'):
     """
     >>> indx = '1-5,34-36,40'
@@ -482,6 +507,14 @@
         scapar = opt['scalar'].split(',')
         scaler = StandardScaler(with_mean='with_mean' in scapar,
                                 with_std='with_std' in scapar)
+
+    if opt['decomposition']:
+        decmp, params = (opt['decomposition'].split('|') 
+                         if '|' in opt['decomposition'] 
+                         else (opt['decomposition'], ''))
+        kwargs = ({k: v for k, v in (p.split('=') for p in params.split(','))}
+                  if params else {})
+        dec = DECMP[decmp](**kwargs)
     # if training extract training
     if vtraining and flg['e']:
         msgr.message("Extract training from: <%s>." % vtraining)
@@ -580,6 +613,7 @@
         C_range = [float(c) for c in opt['svc_c_range'].split(',')]
         gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',')]
         kernel_range = [str(s) for s in opt['svc_kernel_range'].split(',')]
+        poly_range = ([int(i) for i in opt['poly_range'].split(',')]
+                      if opt.get('poly_range') else [])
         msgr.message("Exploring the SVC domain.")
         grid = explore_SVC(Xbt, Ybt, n_folds=3, n_jobs=int(opt['svc_n_jobs']),
                            C=C_range, gamma=gamma_range, kernel=kernel_range)
@@ -595,8 +629,9 @@
         msgr.message("Exploring different classifiers.")
         msgr.message("cls_id   cls_name          mean     max     min     std")
         #import ipdb; ipdb.set_trace()
-        res = explorer_clsfiers(classifiers, Xt, Yt,
-                                indexes=indexes, n_folds=5, bv=flg['v'])
+        res = explorer_clsfiers(classifiers, Xt, Yt, labels=labels,
+                                indexes=indexes, n_folds=5,
+                                bv=flg['v'], extra=flg['x'])
         # TODO: sort(order=...) is working only in the terminal, why?
         #res.sort(order='mean')
         with open(opt['csv_test_cls'], 'w') as csv:

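For reference, the per-fold confusion matrices collected by extra_analysis
are summed and row-normalized by plot_confusion_matrix; a minimal sketch of
that normalization with made-up labels and predictions:

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = [0, 0, 1, 1, 2, 2]
    y_pred = [0, 1, 1, 1, 2, 0]
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
    # normalize each row so that every true class sums to one
    conf = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]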

