[GRASS-SVN] r61110 - grass-addons/grass7/vector/v.class.ml
svn_grass at osgeo.org
Wed Jul 2 00:36:18 PDT 2014
Author: zarch
Date: 2014-07-02 00:36:18 -0700 (Wed, 02 Jul 2014)
New Revision: 61110
Modified:
grass-addons/grass7/vector/v.class.ml/ml_functions.py
grass-addons/grass7/vector/v.class.ml/v.class.ml.py
Log:
v.class.ml: Add decomposition methods
Modified: grass-addons/grass7/vector/v.class.ml/ml_functions.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/ml_functions.py 2014-07-02 03:54:30 UTC (rev 61109)
+++ grass-addons/grass7/vector/v.class.ml/ml_functions.py 2014-07-02 07:36:18 UTC (rev 61110)
@@ -16,7 +16,8 @@
from sklearn import metrics as metrics
-from sklearn.metrics import precision_recall_curve as prc, roc_curve, auc
+from sklearn.metrics import (precision_recall_curve as prc, roc_curve, auc,
+ confusion_matrix)
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
@@ -24,7 +25,9 @@
#from grass.pygrass.messages import get_msgr
+CMAP = plt.cm.Blues
+
COLS = [('cat', 'INTEGER PRIMARY KEY'),
('class', 'INTEGER'),
('color', 'VARCHAR(11)'), ]
@@ -191,19 +194,63 @@
linestyle=train_stl, linewidth=train_width)
ax.legend(loc="upper right")
ax.grid(True, linestyle='-', color='0.75')
- fig.savefig("bv_%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+ fig.savefig("bv__%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+def plot_confusion_matrix(cm, labels, name, fmt='png', **kwargs):
+ conf = np.array(cm).sum(axis=0).astype(float)  # sum over the folds
+ conf /= conf.sum(axis=1)[:, None]  # row-normalize by the true-class totals
+ fig, ax = plt.subplots(figsize=(6, 5))
+ img = ax.imshow(conf, cmap=CMAP)
+ fig.colorbar(img)
+ ticks = range(len(labels))
+ ax.set_xticks(ticks)
+ ax.set_xticklabels(labels)
+ ax.xaxis.set_ticks_position("bottom")
+ ax.set_yticks(ticks)
+ ax.set_yticklabels(labels)
+ ax.set_title("Confusion matrix: %s" % name)
+ ax.grid(False)
+ ax.set_xlabel('Predicted class')
+ ax.set_ylabel('True class')
+ fig.savefig("confusion_matrix__%s.%s" % (name.replace(" ", "_"), fmt),
+ **kwargs)
+
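The per-fold confusion matrices are summed and then row-normalized, so each cell reads as the fraction of a true class assigned to each predicted class. A minimal sketch of the aggregation, with two hypothetical folds of a binary problem:

import numpy as np

# hypothetical per-fold confusion matrices (rows: true, cols: predicted)
cms = [np.array([[8, 2], [1, 9]]),
       np.array([[7, 3], [2, 8]])]
conf = np.array(cms).sum(axis=0).astype(float)  # sum over the folds
conf /= conf.sum(axis=1)[:, None]               # row-normalize
# conf[0] is [0.75, 0.25]: 75% of true class 0 predicted correctly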
+
+def plot_pr(precision, recall, pr_score, name, label=None, fmt='png',
+ **kwargs):
+ fig, ax = plt.subplots(figsize=(6, 5))
+ ax.grid()
+ ax.fill_between(recall, precision, alpha=0.5)
+ ax.plot(recall, precision, lw=1)
+ ax.set_xlim([0.0, 1.0])
+ ax.set_ylim([0.0, 1.0])
+ ax.set_xlabel('Recall')
+ ax.set_ylabel('Precision')
+ ax.set_title('P/R curve (AUC = %0.2f) / %s vs rest' % (pr_score, label))
+ fig.savefig("pr__%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+
+
+def plot_ROC(fpr, tpr, auc_score, name, label, fmt='png', **kwargs):
+ fig, ax = plt.subplots(figsize=(6, 5))
+ ax.grid()
+ ax.plot([0, 1], [0, 1], 'k--')
+ ax.plot(fpr, tpr)
+ ax.fill_between(fpr, tpr, alpha=0.5)
+ ax.set_xlim([0.0, 1.0])
+ ax.set_ylim([0.0, 1.0])
+ ax.set_xlabel('False Positive Rate')
+ ax.set_ylabel('True Positive Rate')
+ ax.set_title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom")
+ ax.legend(loc="lower right")
+ fig.savefig("roc__%s.%s" % (name.replace(" ", "_"), fmt), **kwargs)
+
+
def bias_variance_analysis(cls, tdata, tclss, n_folds=5, step=5):
clss = sorted(set(tclss))
num = min([len(tclss[tclss == c]) for c in clss])
-
clf = cls['classifier'](**cls['kwargs'])
- keys = ('fprs', 'tprs', 'roc_scores', 'pr_scores', 'precisions',
- 'recalls', 'thresholds')
-
bv = {}
- lk = {l: {k: [] for k in keys} for l in clss}
for n in range(5, num, step):
X, y = balance(tdata, tclss, n)
cv = StratifiedKFold(y, n_folds=n_folds)
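balance() is imported from elsewhere in the module and is not part of this diff; a hypothetical equivalent that draws the same number of samples from every class, so the error curves stay comparable as n grows:

import numpy as np

def balance(tdata, tclss, n):
    # hypothetical sketch: pick n random rows of each class
    idx = np.concatenate([np.random.choice(np.where(tclss == c)[0], n,
                                           replace=False)
                          for c in np.unique(tclss)])
    return tdata[idx], tclss[idx]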
@@ -225,29 +272,79 @@
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
- # get probability
- proba = clf.predict_proba(X_test)
-
- # compute score for each class VS rest
- for idx, label in enumerate(clss):
- fpr, tpr, roc_thr = roc_curve(y_test, proba[:, idx], label)
- precision, recall, pr_thr = prc(y_test, proba[:, idx], label)
- lk[label]['fprs'].append(fpr)
- lk[label]['tprs'].append(tpr)
- lk[label]['roc_scores'].append(auc(fpr, tpr))
-
- lk[label]['precisions'].append(precision)
- lk[label]['recalls'].append(recall)
- lk[label]['thresholds'].append(pr_thr)
- lk[label]['pr_scores'].append(auc(recall, precision))
bv[n] = {'test': np.array(test_errors),
'train': np.array(train_errors),
'score': np.array(scores)}
cls['bias variance'] = bv
+
+
+def extra_analysis(cls, tdata, tclss, labels, n_folds=10):
+ clss = sorted(labels.keys())
+ lbs = [labels[cl] for cl in clss]
+ cv = StratifiedKFold(tclss, n_folds=n_folds)
+ keys = ('fprs', 'tprs', 'roc_scores', 'pr_scores', 'precisions',
+ 'recalls', 'thresholds')
+ train_errors, test_errors, scores, cms = [], [], [], []
+ lk = {l: {k: [] for k in keys} for l in clss}
+ clf = cls['classifier'](**cls['kwargs'])
+ for train, test in cv:
+ X_train, y_train = tdata[train], tclss[train]
+ X_test, y_test = tdata[test], tclss[test]
+ # fit train data
+ clf.fit(X_train, y_train)
+
+ train_score = clf.score(X_train, y_train)
+ test_score = clf.score(X_test, y_test)
+ scores.append(test_score)
+
+ train_errors.append(1 - train_score)
+ test_errors.append(1 - test_score)
+
+ y_pred = clf.predict(X_test)
+ cms.append(confusion_matrix(y_test, y_pred, lbs))
+ # get probability
+ proba = clf.predict_proba(X_test)
+ # compute score for each class VS rest
+ for idx, label in enumerate(clss):
+ fpr, tpr, roc_thr = roc_curve(y_test, proba[:, idx], label)
+ precision, recall, pr_thr = prc(y_test, proba[:, idx], label)
+ lk[label]['fprs'].append(fpr)
+ lk[label]['tprs'].append(tpr)
+ lk[label]['roc_scores'].append(auc(fpr, tpr))
+
+ lk[label]['precisions'].append(precision)
+ lk[label]['recalls'].append(recall)
+ lk[label]['thresholds'].append(pr_thr)
+ lk[label]['pr_scores'].append(auc(recall, precision))
cls['label scores'] = lk
+ cls['train errors'] = np.array(train_errors)
+ cls['test errors'] = np.array(test_errors)
+ cls['confusion matrix'] = cms
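Each class is scored one-vs-rest by taking its predict_proba column as the decision score. A minimal sketch with hypothetical values:

import numpy as np
from sklearn.metrics import roc_curve, auc

y_test = np.array([1, 1, 2, 2])
proba = np.array([[0.9, 0.1],    # hypothetical predict_proba output
                  [0.6, 0.4],
                  [0.35, 0.65],
                  [0.2, 0.8]])
# class 1 vs rest: column 0 is the score, 1 the positive label
fpr, tpr, thr = roc_curve(y_test, proba[:, 0], pos_label=1)
print(auc(fpr, tpr))  # 1.0: the two classes are perfectly separated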
-def explorer_clsfiers(clsses, Xd, Yd, indexes=None, n_folds=5, bv=False):
+def plot_extra(cls, labels, fmt='png', **kwargs):
+ clss = sorted(labels.keys())
+ lk = cls['label scores']
+ for cl in clss:
+ scores_to_sort = lk[cl]['roc_scores']
+ median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
+ name = "%s %s" % (cls['name'], labels[cl])
+ plot_pr(lk[cl]['precisions'][median],
+ lk[cl]['recalls'][median],
+ lk[cl]['pr_scores'][median],
+ name=name, label=labels[cl])
+ plot_ROC(lk[cl]['fprs'][median],
+ lk[cl]['tprs'][median],
+ lk[cl]['roc_scores'][median],
+ name=name, label=labels[cl])
+ plot_confusion_matrix(cls['confusion matrix'],
+ labels=[labels[cl] for cl in clss],
+ name=cls['name'])
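The fold with the median ROC score is plotted, so the curves show typical rather than best-case performance. The argsort selection, with hypothetical scores:

import numpy as np

roc_scores = [0.71, 0.83, 0.77, 0.90, 0.65]
median = np.argsort(roc_scores)[len(roc_scores) // 2]
print(roc_scores[median])  # 0.77, the median score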
+
+
+def explorer_clsfiers(clsses, Xd, Yd, labels, indexes=None, n_folds=5,
+ bv=False, extra=False):
gen = zip(indexes, clsses) if indexes else enumerate(clsses)
cv = StratifiedKFold(Yd, n_folds=n_folds)
fmt = '%5d %-30s %6.4f %6.4f %6.4f %6.4f'
@@ -279,8 +376,12 @@
train_width=1, test_width=1,
train_clr='b', test_clr='r', alpha=0.2,
fmt='png', **kw)
- with open("%s.pkl" % cls['name'].replace(' ', '_'), 'wb') as pkl:
- pk.dump(cls, pkl)
+ if extra:
+ extra_analysis(cls, Xd, Yd, labels)
+ plot_extra(cls, labels, **kw)
+ with open("%s.pkl" % cls['name'].replace(' ', '_'), 'wb') as pkl:
+ pk.dump(cls, pkl)
except:
#import ipdb; ipdb.set_trace()
#print('problem with: %s' % cls['name'])
@@ -304,9 +405,6 @@
print('Error in: %s' % err['name'])
-CMAP = plt.cm.Blues
-
-
def plot_grid(grid, save=''):
C = grid.param_grid['C']
gamma = grid.param_grid['gamma']
Modified: grass-addons/grass7/vector/v.class.ml/v.class.ml.py
===================================================================
--- grass-addons/grass7/vector/v.class.ml/v.class.ml.py 2014-07-02 03:54:30 UTC (rev 61109)
+++ grass-addons/grass7/vector/v.class.ml/v.class.ml.py 2014-07-02 07:36:18 UTC (rev 61110)
@@ -35,7 +35,7 @@
#% key: vlayer
#% type: string
#% multiple: no
-#% description: layer name or number to use for the machine learning
+#% description: layer name or number to use for data
#% required: no
#%end
#%option
@@ -120,7 +120,7 @@
#% key: imp_csv
#% type: string
#% multiple: no
-#% description: Feature importances with forests of trees: CSV
+#% description: CSV file name with the feature importances ranking computed with the extra-trees algorithm
#% answer: features_importances.csv
#% required: no
#%end
@@ -128,7 +128,7 @@
#% key: imp_fig
#% type: string
#% multiple: no
-#% description: Feature importances with forests of trees: figure
+#% description: Figure file name with the feature importances ranking computed with the extra-trees algorithm
#% answer: features_importances.png
#% required: no
#%end
@@ -141,10 +141,18 @@
#% answer: with_mean,with_std
#%end
#%option
+#% key: decomposition
+#% type: string
+#% multiple: no
+#% description: Decomposition method (PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, FastICA, TruncatedSVD, LDA); use | to separate the method from its parameters, e.g.: PCA|n_components=98
+#% required: no
+#% answer:
+#%end
+#%option
#% key: n_training
#% type: integer
#% multiple: no
-#% description: Number of random training to training the machine learning
+#% description: Number of random training samples per class used to train the machine learning algorithms
#% required: no
#%end
#%option
@@ -165,7 +173,7 @@
#% key: pyindx
#% type: string
#% multiple: no
-#% description: specify the index of the classifiers that you want to use
+#% description: Specify the index or range of indexes of the classifiers to use
#% required: no
#%end
#%option
@@ -187,7 +195,7 @@
#% key: inf
#% type: string
#% multiple: yes
-#% description: Key:Value or Numpy funtion to use to substitute NaN values
+#% description: Key:Value or Numpy function to use to substitute Inf values
#% required: no
#% answer: *_skewness:nanmean,*_kurtosis:nanmean
#%end
@@ -195,7 +203,7 @@
#% key: neginf
#% type: string
#% multiple: yes
-#% description: Key:Value or Numpy funtion to use to substitute NaN values
+#% description: Key:Value or Numpy function to use to substitute -Inf values
#% required: no
#% answer:
#%end
@@ -203,7 +211,7 @@
#% key: posinf
#% type: double
#% multiple: yes
-#% description: Key:Value or Numpy funtion to use to substitute NaN values
+#% description: Key:Value or Numpy function to use to substitute +Inf values
#% required: no
#% answer:
#%end
@@ -219,7 +227,7 @@
#% key: report_class
#% type: string
#% multiple: no
-#% description: csv file name with results of different machine learning scores
+#% description: Text file name with the classification report of the different machine learning algorithms
#% required: no
#% answer: classification_report.txt
#%end
@@ -227,7 +235,7 @@
#% key: svc_c_range
#% type: double
#% multiple: yes
-#% description: C value list
+#% description: List of C values used to explore the SVC domain
#% required: no
#% answer: 1e-2,1e-1,1e0,1e1,1e2,1e3,1e4,1e5,1e6,1e7,1e8
#%end
@@ -235,7 +243,7 @@
#% key: svc_gamma_range
#% type: double
#% multiple: yes
-#% description: gamma value list
+#% description: List of gamma values used to explore the SVC domain
#% required: no
#% answer: 1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4
#%end
@@ -243,7 +251,7 @@
#% key: svc_kernel_range
#% type: string
#% multiple: yes
-#% description: kernel value list
+#% description: List of kernels used to explore the SVC domain
#% required: no
#% answer: linear,poly,rbf,sigmoid
#%end
@@ -251,7 +259,7 @@
#% key: svc_n_jobs
#% type: integer
#% multiple: no
-#% description: number of jobs
+#% description: number of jobs to use during the domain exploration
#% required: no
#% answer: 1
#%end
@@ -259,21 +267,21 @@
#% key: svc_c
#% type: double
#% multiple: no
-#% description: C value
+#% description: Final C value used for the classification
#% required: no
#%end
#%option
#% key: svc_gamma
#% type: double
#% multiple: no
-#% description: gamma value
+#% description: Final gamma value used for the classification
#% required: no
#%end
#%option
#% key: svc_kernel
#% type: string
#% multiple: no
-#% description: Available kernel are: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
+#% description: Final kernel used for the classification. Available kernels are: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
#% required: no
#% answer: rbf
#%end
@@ -281,7 +289,7 @@
#% key: svc_img
#% type: string
#% multiple: no
-#% description: filename with the image od SVC parameter
+#% description: Filename pattern for the image of the SVC parameter domain
#% required: no
#% answer: domain_%s.svg
#%end
@@ -289,7 +297,7 @@
#% key: rst_names
#% type: string
#% multiple: no
-#% description: filename with the image od SVC parameter
+#% description: Filename pattern for the output raster maps
#% required: no
#% answer: %s
#%end
@@ -304,15 +312,15 @@
#%end
#%flag
#% key: f
-#% description: Feature importances with forests of trees
+#% description: Compute the feature importances with the extra-trees algorithm
#%end
#%flag
#% key: b
-#% description: Balance the training using the class with the minor number of areas
+#% description: Balance the training set using the class with the fewest samples
#%end
#%flag
#% key: o
-#% description: optimize the training samples
+#% description: Optimize the training samples
#%end
#%flag
#% key: c
@@ -320,7 +328,7 @@
#%end
#%flag
#% key: r
-#% description: Export the classify resutls to raster maps
+#% description: Export the classification results to raster maps
#%end
#%flag
#% key: t
@@ -328,9 +336,13 @@
#%end
#%flag
#% key: v
-#% description: Bias variance
+#% description: Compute the bias-variance analysis during the test
#%end
#%flag
+#% key: x
+#% description: Compute extra metrics during the test: confusion matrix, ROC and PR curves
+#%end
+#%flag
#% key: d
#% description: Explore the SVC domain
#%end
@@ -368,7 +380,11 @@
from npy2table import export_results
from features import importances, tocsv
+from sklearn.decomposition import (PCA, KernelPCA, ProbabilisticPCA,
+ RandomizedPCA, FastICA, TruncatedSVD)
+from sklearn.lda import LDA
+
RULES = {'*_skewness': np.nanmean,
'*_coeff_var': np.nanmean,
'*_stddev': np.nanmean,
@@ -379,6 +395,15 @@
'*_min': np.nanmin, }
+DECMP = {'PCA': PCA,
+ 'KernelPCA': KernelPCA,
+ 'ProbabilisticPCA': ProbabilisticPCA,
+ 'RandomizedPCA': RandomizedPCA,
+ 'FastICA': FastICA,
+ 'TruncatedSVD': TruncatedSVD,
+ 'LDA': LDA}
+
+
def get_indexes(string, sep=',', rangesep='-'):
"""
>>> indx = '1-5,34-36,40'
@@ -482,6 +507,14 @@
scapar = opt['scalar'].split(',')
scaler = StandardScaler(with_mean='with_mean' in scapar,
with_std='with_std' in scapar)
+
+ if opt['decomposition']:
+ decmp, params = (opt['decomposition'].split('|')
+ if '|' in opt['decomposition']
+ else (opt['decomposition'], ''))
+ kwargs = ({k: v for k, v in (p.split('=') for p in params.split(','))}
+ if params else {})
+ dec = DECMP[decmp](**kwargs)
# if training extract training
if vtraining and flg['e']:
msgr.message("Extract training from: <%s>." % vtraining)
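The decomposition option is split on | into the method name and its parameters. Note that the values parsed above stay strings, so parameters that scikit-learn expects as numbers (e.g. n_components) may need coercion; a minimal sketch of the parsing with a hypothetical value() helper for the conversion:

from sklearn.decomposition import PCA

DECMP = {'PCA': PCA}

def value(v):
    # hypothetical helper: try int, then float, else keep the string
    for cast in (int, float):
        try:
            return cast(v)
        except ValueError:
            pass
    return v

opt = 'PCA|n_components=98'
decmp, params = opt.split('|') if '|' in opt else (opt, '')
kwargs = ({k: value(v) for k, v in (p.split('=') for p in params.split(','))}
          if params else {})
dec = DECMP[decmp](**kwargs)  # PCA(n_components=98)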
@@ -580,6 +613,7 @@
C_range = [float(c) for c in opt['svc_c_range'].split(',')]
gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',')]
kernel_range = [str(s) for s in opt['svc_kernel_range'].split(',')]
+ poly_range = [int(i) for i in opt['poly_range'].split(',')]
msgr.message("Exploring the SVC domain.")
grid = explore_SVC(Xbt, Ybt, n_folds=3, n_jobs=int(opt['svc_n_jobs']),
C=C_range, gamma=gamma_range, kernel=kernel_range)
@@ -595,8 +629,9 @@
msgr.message("Exploring different classifiers.")
msgr.message("cls_id cls_name mean max min std")
#import ipdb; ipdb.set_trace()
- res = explorer_clsfiers(classifiers, Xt, Yt,
- indexes=indexes, n_folds=5, bv=flg['v'])
+ res = explorer_clsfiers(classifiers, Xt, Yt, labels=labels,
+ indexes=indexes, n_folds=5,
+ bv=flg['v'], extra=flg['x'])
# TODO: sort(order=...) is working only in the terminal, why?
#res.sort(order='mean')
with open(opt['csv_test_cls'], 'w') as csv: