[GRASS-SVN] r70901 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
Wed Apr 19 13:59:52 PDT 2017
Author: spawley
Date: 2017-04-19 13:59:52 -0700 (Wed, 19 Apr 2017)
New Revision: 70901
Modified:
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
grass-addons/grass7/raster/r.learn.ml/raster_learning.py
Log:
r.learn.ml: added an option to export hyperparameter tuning results to a CSV file
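The new param_file option works together with the new -t (tuning only) flag and the grid search that the module runs when a hyperparameter receives multiple values. A hypothetical invocation of the changed module (the group, map, and file names below are placeholders, not from this commit):

    r.learn.ml group=landsat_bands trainingmap=landclass \
        classifier=RandomForestClassifier n_estimators=100,250,500 \
        param_file=tuning_scores.csv -t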
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-19 20:10:53 UTC (rev 70900)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-19 20:59:52 UTC (rev 70901)
@@ -53,24 +53,24 @@
#%option G_OPT_R_OUTPUT
#% key: output
-#% required: yes
#% label: Output Map
#% description: Prediction surface result from classification or regression model
+#% required: no
#%end
#%option string
#% key: classifier
-#% required: yes
#% label: Classifier
#% description: Supervised learning model to use
#% answer: RandomForestClassifier
#% options: LogisticRegression,LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis,GaussianNB,DecisionTreeClassifier,DecisionTreeRegressor,RandomForestClassifier,RandomForestRegressor,ExtraTreesClassifier,ExtraTreesRegressor,GradientBoostingClassifier,GradientBoostingRegressor,SVC,EarthClassifier,EarthRegressor,XGBClassifier,XGBRegressor
+#% required: no
#%end
#%option
#% key: c
#% type: double
-#% description: Inverse of regularization strength (logistic regresson and SVC)
+#% description: Inverse of regularization strength
#% answer: 1.0
#% multiple: yes
#% guisection: Classifier Parameters
@@ -79,8 +79,7 @@
#%option
#% key: max_features
#% type: integer
-#% description: Number of features to consider during splitting for tree based classifiers. Default is sqrt(n_features) for classification, and n_features for regression
-#% required: no
+#% description: Number of features available during node splitting
#% answer:0
#% multiple: yes
#% guisection: Classifier Parameters
@@ -89,8 +88,7 @@
#%option
#% key: max_depth
#% type: integer
-#% description: Optionally specifiy maximum tree depth. Otherwise full-growing occurs for decision trees and random forests, and max_depth=3 for gradient boosting
-#% required: no
+#% description: Maximum tree depth; zero uses classifier defaults
#% answer:0
#% multiple: yes
#% guisection: Classifier Parameters
@@ -99,7 +97,7 @@
#%option
#% key: min_samples_split
#% type: integer
-#% description: The minimum number of samples required for node splitting in tree based classifiers
+#% description: The minimum number of samples required for node splitting
#% answer: 2
#% multiple: yes
#% guisection: Classifier Parameters
@@ -108,7 +106,7 @@
#%option
#% key: min_samples_leaf
#% type: integer
-#% description: The minimum number of samples required to form a leaf node for tree based classifiers
+#% description: The minimum number of samples required to form a leaf node
#% answer: 1
#% multiple: yes
#% guisection: Classifier Parameters
@@ -117,7 +115,7 @@
#%option
#% key: n_estimators
#% type: integer
-#% description: Number of estimators for tree-based classifiers
+#% description: Number of estimators
#% answer: 100
#% multiple: yes
#% guisection: Classifier Parameters
@@ -126,7 +124,7 @@
#%option
#% key: learning_rate
#% type: double
-#% description: learning rate for gradient boosting
+#% description: learning rate
#% answer: 0.1
#% multiple: yes
#% guisection: Classifier Parameters
@@ -135,7 +133,7 @@
#%option
#% key: subsample
#% type: double
-#% description: The fraction of samples to be used for fitting for gradient boosting
+#% description: The fraction of samples to be used for fitting
#% answer: 1.0
#% multiple: yes
#% guisection: Classifier Parameters
@@ -143,27 +141,14 @@
#%option integer
#% key: max_degree
-#% description: The maximum degree of terms generated by the forward pass in Earth
+#% description: The maximum degree of terms in forward pass
#% answer: 1
#% multiple: yes
#% guisection: Classifier Parameters
#%end
-#%flag
-#% key: s
-#% label: Standardization preprocessing
-#% guisection: Optional
-#%end
-
-#%flag
-#% key: i
-#% label: Impute missing values in training data
-#% guisection: Optional
-#%end
-
#%option integer
#% key: categorymaps
-#% required: no
#% multiple: yes
#% label: Indices of categorical rasters within the imagery group (0..n)
#% description: Indices of categorical rasters within the imagery group (0..n)
@@ -171,7 +156,6 @@
#%option string
#% key: cvtype
-#% required: no
#% label: Non-spatial or spatial cross-validation
#% description: Non-spatial, clumped or clustered k-fold cross-validation
#% answer: Non-spatial
@@ -197,7 +181,7 @@
#%option
#% key: cv
#% type: integer
-#% description: Number of cross-validation folds for performance evaluation
+#% description: Number of cross-validation folds
#% answer: 1
#% guisection: Optional
#%end
@@ -205,18 +189,11 @@
#%option
#% key: random_state
#% type: integer
-#% description: Seed to pass onto the random state for reproducible results
+#% description: Seed to use for random state
#% answer: 1
#% guisection: Optional
#%end
-#%option G_OPT_F_OUTPUT
-#% key: errors_file
-#% label: Save cross-validation global accuracy results to csv
-#% required: no
-#% guisection: Optional
-#%end
-
#%option
#% key: lines
#% type: integer
@@ -225,7 +202,44 @@
#% guisection: Optional
#%end
+#%option
+#% key: indexes
+#% type: integer
+#% description: Indexes of class probabilities to predict. Default -1 predicts all classes
+#% answer: -1
+#% guisection: Optional
+#% multiple: yes
+#%end
+
+#%option
+#% key: n_permutations
+#% type: integer
+#% description: Number of permutations to perform for feature importances
+#% answer: 50
+#% guisection: Optional
+#%end
+
+#%option
+#% key: n_jobs
+#% type: integer
+#% description: Number of cores for multiprocessing, -2 is n_cores-1
+#% answer: -2
+#% guisection: Optional
+#%end
+
#%flag
+#% key: s
+#% label: Standardization preprocessing
+#% guisection: Optional
+#%end
+
+#%flag
+#% key: i
+#% label: Impute missing values in training data
+#% guisection: Optional
+#%end
+
+#%flag
#% key: p
#% label: Output class membership probabilities
#% guisection: Optional
@@ -244,38 +258,47 @@
#%end
#%flag
+#% key: t
+#% description: Perform hyperparameter tuning only
+#% guisection: Optional
+#%end
+
+#%flag
#% key: f
#% description: Calculate feature importances using permutation
#% guisection: Optional
#%end
-#%option
-#% key: indexes
-#% type: integer
-#% description: Indexes of class probabilities to predict. Default -1 predicts all classes
-#% answer: -1
+#%flag
+#% key: b
+#% description: Balance training data using class weights
#% guisection: Optional
-#% multiple: yes
#%end
-#%option
-#% key: n_permutations
-#% type: integer
-#% description: Number of permutations to perform for feature importances
-#% answer: 10
+#%flag
+#% key: l
+#% label: Use memory swap
#% guisection: Optional
#%end
#%option G_OPT_F_OUTPUT
+#% key: errors_file
+#% label: Save cross-validation global accuracy results to csv
+#% required: no
+#% guisection: Optional
+#%end
+
+#%option G_OPT_F_OUTPUT
#% key: fimp_file
#% label: Save feature importances to csv
#% required: no
#% guisection: Optional
#%end
-#%flag
-#% key: b
-#% description: Balance training data using class weights
+#%option G_OPT_F_OUTPUT
+#% key: param_file
+#% label: Save hyperparameter search scores to csv
+#% required: no
#% guisection: Optional
#%end
@@ -307,12 +330,6 @@
#% guisection: Optional
#%end
-#%flag
-#% key: l
-#% label: Use memory swap
-#% guisection: Optional
-#%end
-
#%rules
#% exclusive: trainingmap,load_model
#% exclusive: load_training,save_training
@@ -350,11 +367,18 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
+ from sklearn import metrics
+ from sklearn.metrics import make_scorer
import warnings
warnings.filterwarnings('ignore') # turn off UndefinedMetricWarning
except:
grass.fatal("Scikit learn 0.18 or newer is not installed")
+ try:
+ import pandas as pd
+ except:
+ grass.fatal("Pandas is not installed")
+
group = options['group']
trainingmap = options['trainingmap']
trainingpoints = options['trainingpoints']
@@ -371,6 +395,7 @@
n_partitions = int(options['n_partitions'])
modelonly = flags['m']
probability = flags['p']
+ tuneonly = flags['t']
rowincr = int(options['lines'])
random_state = int(options['random_state'])
model_save = options['save_model']
@@ -386,11 +411,13 @@
if indexes == [-1]:
indexes = None
n_permutations = int(options['n_permutations'])
+ n_jobs = int(options['n_jobs'])
lowmem = flags['l']
impute = flags['i']
prob_only = flags['z']
errors_file = options['errors_file']
fimp_file = options['fimp_file']
+ param_file = options['param_file']
balance = flags['b']
if balance is True:
balance = 'balanced'
@@ -430,7 +457,7 @@
# retrieve sklearn classifier object and parameters
clf, mode = model_classifiers(
- classifier, random_state, hyperparams, balance)
+ classifier, random_state, n_jobs, hyperparams, balance)
# remove dict keys that are incompatible for the selected classifier
clf_params = clf.get_params()
@@ -440,9 +467,11 @@
# scoring metrics
if mode == 'classification':
- scoring = ['accuracy', 'precision', 'recall', 'f1', 'kappa', 'balanced_accuracy']
+ scoring = ['matthews_corrcoef', 'accuracy', 'precision', 'recall', 'f1', 'kappa', 'balanced_accuracy']
+ search_scorer = make_scorer(metrics.cohen_kappa_score)
else:
scoring = ['r2', 'neg_mean_squared_error']
+ search_scorer = 'r2'
# Sample training data and group ids
# ----------------------------------
@@ -501,7 +530,7 @@
if cvtype == 'kmeans':
clusters = KMeans(
n_clusters=n_partitions,
- random_state=random_state, n_jobs=-1)
+ random_state=random_state, n_jobs=n_jobs)
clusters.fit(sample_coords)
group_id = clusters.labels_
@@ -592,8 +621,8 @@
# create grid search method
clf = GridSearchCV(
- estimator=clf, param_grid=param_grid, scoring=scoring[0],
- n_jobs=-1, cv=resampling)
+ estimator=clf, param_grid=param_grid, scoring=search_scorer,
+ n_jobs=n_jobs, cv=resampling)
# classifier training
# -------------------
@@ -620,12 +649,15 @@
grass.message(os.linesep)
grass.message('Best parameters:')
grass.message(str(clf.best_params_))
+ if param_file != '':
+ param_df = pd.DataFrame(clf.cv_results_)
+ param_df.to_csv(param_file)
# cross-validation
# -----------------
# If cv > 1 then use cross-validation to generate performance measures
- if cv > 1:
+ if cv > 1 and tuneonly is not True:
if mode == 'classification' and cv > np.histogram(
y, bins=len(np.unique(y)))[0].min():
grass.message(os.linesep)
@@ -668,14 +700,8 @@
# write cross-validation results for csv file
if errors_file != '':
- try:
- import pandas as pd
- errors = pd.DataFrame(scores)
- errors.to_csv(errors_file, mode='w')
- except:
- grass.warning('Pandas is not installed. Pandas is '
- 'required to write the cross-validation '
- 'results to file')
+ errors = pd.DataFrame(scores)
+ errors.to_csv(errors_file, mode='w')
# feature importances
if importances is True:
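The param_file export above relies on GridSearchCV.cv_results_ being a dict of equal-length arrays that pandas can turn straight into a table. A minimal, self-contained sketch of that pattern (the dataset, parameter grid, and output name are illustrative, not from r.learn.ml):

    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = load_iris(return_X_y=True)
    search = GridSearchCV(RandomForestClassifier(random_state=1),
                          param_grid={'n_estimators': [50, 100]}, cv=3)
    search.fit(X, y)

    # cv_results_ holds one entry per parameter combination (mean/std test
    # scores, fit times, and the parameter values themselves); writing it
    # out mirrors what r.learn.ml now does when param_file is given
    pd.DataFrame(search.cv_results_).to_csv('tuning_scores.csv')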
Modified: grass-addons/grass7/raster/r.learn.ml/raster_learning.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/raster_learning.py 2017-04-19 20:10:53 UTC (rev 70900)
+++ grass-addons/grass7/raster/r.learn.ml/raster_learning.py 2017-04-19 20:59:52 UTC (rev 70901)
@@ -2,6 +2,7 @@
import numpy as np
from numpy.random import RandomState
import tempfile
+import itertools
from copy import deepcopy
import grass.script as grass
from grass.pygrass.raster import RasterRow
@@ -237,7 +238,7 @@
# metrics that have no averaging for multiclass
elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
or m == 'hamming_loss' or m == 'jaccard_similarity' \
- or m == 'log_loss' or m == 'zero_one_loss':
+ or m == 'log_loss' or m == 'zero_one_loss' or m == 'matthews_corrcoef':
scores[m] = np.append(
scores[m], scoring_methods[m](y_test, y_pred))
@@ -412,7 +413,7 @@
pass
-def model_classifiers(estimator, random_state, p, weights=None):
+def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
"""
Provides the classifiers and parameters used by the module
@@ -421,6 +422,7 @@
----
estimator: Name of estimator
random_state: Seed to use in randomized components
+ n_jobs: Integer, number of processing cores to use
p: Dict, containing classifier settings
weights: None, or 'balanced' to add class_weights
@@ -451,7 +453,7 @@
earth_classifier = Pipeline([('Earth',
Earth(max_degree=p['max_degree'])),
- ('Logistic', LogisticRegression())])
+ ('Logistic', LogisticRegression(n_jobs=n_jobs))])
classifiers = {'EarthClassifier': earth_classifier,
'EarthRegressor': Earth(max_degree=p['max_degree'])}
@@ -470,12 +472,14 @@
XGBClassifier(learning_rate=p['learning_rate'],
n_estimators=p['n_estimators'],
max_depth=p['max_depth'],
- subsample=p['subsample']),
+ subsample=p['subsample'],
+ nthread=n_jobs),
'XGBRegressor':
XGBRegressor(learning_rate=p['learning_rate'],
n_estimators=p['n_estimators'],
max_depth=p['max_depth'],
- subsample=p['subsample'])}
+ subsample=p['subsample'],
+ nthread=n_jobs)}
except:
grass.fatal('XGBoost package not installed')
else:
@@ -489,7 +493,7 @@
LogisticRegression(C=p['C'],
class_weight=weights,
random_state=random_state,
- n_jobs=-1,
+ n_jobs=n_jobs,
fit_intercept=True),
'DecisionTreeClassifier':
DecisionTreeClassifier(max_depth=p['max_depth'],
@@ -510,7 +514,7 @@
min_samples_leaf=p['min_samples_leaf'],
class_weight=weights,
random_state=random_state,
- n_jobs=-1,
+ n_jobs=n_jobs,
oob_score=False),
'RandomForestRegressor':
RandomForestRegressor(n_estimators=p['n_estimators'],
@@ -518,7 +522,7 @@
min_samples_split=p['min_samples_split'],
min_samples_leaf=p['min_samples_leaf'],
random_state=random_state,
- n_jobs=-1,
+ n_jobs=n_jobs,
oob_score=False),
'ExtraTreesClassifier':
ExtraTreesClassifier(n_estimators=p['n_estimators'],
@@ -527,7 +531,7 @@
min_samples_leaf=p['min_samples_leaf'],
class_weight=weights,
random_state=random_state,
- n_jobs=-1,
+ n_jobs=n_jobs,
oob_score=False),
'ExtraTreesRegressor':
ExtraTreesRegressor(n_estimators=p['n_estimators'],
@@ -535,7 +539,7 @@
min_samples_split=p['min_samples_split'],
min_samples_leaf=p['min_samples_leaf'],
random_state=random_state,
- n_jobs=-1,
+ n_jobs=n_jobs,
oob_score=False),
'GradientBoostingClassifier':