[GRASS-SVN] r70901 - grass-addons/grass7/raster/r.learn.ml

svn_grass at osgeo.org
Wed Apr 19 13:59:52 PDT 2017


Author: spawley
Date: 2017-04-19 13:59:52 -0700 (Wed, 19 Apr 2017)
New Revision: 70901

Modified:
   grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
   grass-addons/grass7/raster/r.learn.ml/raster_learning.py
Log:
r.learn.ml: added option to export hyperparameter tuning results to a CSV file

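The export is built on scikit-learn's GridSearchCV.cv_results_ attribute (available since 0.18). A minimal standalone sketch of the same path, with an illustrative estimator, parameter grid and file name rather than the module's defaults:

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=200, random_state=1)
    clf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=1),
        param_grid={'n_estimators': [50, 100], 'max_depth': [2, 4]},
        cv=3)
    clf.fit(X, y)

    # cv_results_ is a dict of per-candidate arrays (mean/std test
    # scores, fit times, parameter values); pandas flattens it into
    # one row per parameter combination for the csv
    pd.DataFrame(clf.cv_results_).to_csv('param_file.csv')

This is also why pandas becomes a hard requirement in the patch below: previously it was imported lazily and only needed when errors_file was given.
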
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py	2017-04-19 20:10:53 UTC (rev 70900)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py	2017-04-19 20:59:52 UTC (rev 70901)
@@ -53,24 +53,24 @@
 
 #%option G_OPT_R_OUTPUT
 #% key: output
-#% required: yes
 #% label: Output Map
 #% description: Prediction surface result from classification or regression model
+#% required: no
 #%end
 
 #%option string
 #% key: classifier
-#% required: yes
 #% label: Classifier
 #% description: Supervised learning model to use
 #% answer: RandomForestClassifier
 #% options: LogisticRegression,LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis,GaussianNB,DecisionTreeClassifier,DecisionTreeRegressor,RandomForestClassifier,RandomForestRegressor,ExtraTreesClassifier,ExtraTreesRegressor,GradientBoostingClassifier,GradientBoostingRegressor,SVC,EarthClassifier,EarthRegressor,XGBClassifier,XGBRegressor
+#% required: no
 #%end
 
 #%option
 #% key: c
 #% type: double
-#% description: Inverse of regularization strength (logistic regresson and SVC)
+#% description: Inverse of regularization strength
 #% answer: 1.0
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -79,8 +79,7 @@
 #%option
 #% key: max_features
 #% type: integer
-#% description: Number of features to consider during splitting for tree based classifiers. Default is sqrt(n_features) for classification, and n_features for regression
-#% required: no
+#% description: Number of features available during node splitting
 #% answer: 0
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -89,8 +88,7 @@
 #%option
 #% key: max_depth
 #% type: integer
-#% description: Optionally specifiy maximum tree depth. Otherwise full-growing occurs for decision trees and random forests, and max_depth=3 for gradient boosting
-#% required: no
+#% description: Maximum tree depth; zero uses classifier defaults
 #% answer: 0
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -99,7 +97,7 @@
 #%option
 #% key: min_samples_split
 #% type: integer
-#% description: The minimum number of samples required for node splitting in tree based classifiers
+#% description: The minimum number of samples required for node splitting
 #% answer: 2
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -108,7 +106,7 @@
 #%option
 #% key: min_samples_leaf
 #% type: integer
-#% description: The minimum number of samples required to form a leaf node for tree based classifiers
+#% description: The minimum number of samples required to form a leaf node
 #% answer: 1
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -117,7 +115,7 @@
 #%option
 #% key: n_estimators
 #% type: integer
-#% description: Number of estimators for tree-based classifiers
+#% description: Number of estimators
 #% answer: 100
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -126,7 +124,7 @@
 #%option
 #% key: learning_rate
 #% type: double
-#% description: learning rate for gradient boosting
+#% description: learning rate
 #% answer: 0.1
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -135,7 +133,7 @@
 #%option
 #% key: subsample
 #% type: double
-#% description: The fraction of samples to be used for fitting for gradient boosting
+#% description: The fraction of samples to be used for fitting
 #% answer: 1.0
 #% multiple: yes
 #% guisection: Classifier Parameters
@@ -143,27 +141,14 @@
 
 #%option integer
 #% key: max_degree
-#% description: The maximum degree of terms generated by the forward pass in Earth
+#% description: The maximum degree of terms in forward pass
 #% answer: 1
 #% multiple: yes
 #% guisection: Classifier Parameters
 #%end
 
-#%flag
-#% key: s
-#% label: Standardization preprocessing
-#% guisection: Optional
-#%end
-
-#%flag
-#% key: i
-#% label: Impute missing values in training data
-#% guisection: Optional
-#%end
-
 #%option integer
 #% key: categorymaps
-#% required: no
 #% multiple: yes
 #% label: Indices of categorical rasters within the imagery group (0..n)
 #% description: Indices of categorical rasters within the imagery group (0..n)
@@ -171,7 +156,6 @@
 
 #%option string
 #% key: cvtype
-#% required: no
 #% label: Non-spatial or spatial cross-validation
 #% description: Non-spatial, clumped or clustered k-fold cross-validation
 #% answer: Non-spatial
@@ -197,7 +181,7 @@
 #%option
 #% key: cv
 #% type: integer
-#% description: Number of cross-validation folds for performance evaluation
+#% description: Number of cross-validation folds
 #% answer: 1
 #% guisection: Optional
 #%end
@@ -205,18 +189,11 @@
 #%option
 #% key: random_state
 #% type: integer
-#% description: Seed to pass onto the random state for reproducible results
+#% description: Seed to use for random state
 #% answer: 1
 #% guisection: Optional
 #%end
 
-#%option G_OPT_F_OUTPUT
-#% key: errors_file
-#% label: Save cross-validation global accuracy results to csv
-#% required: no
-#% guisection: Optional
-#%end
-
 #%option
 #% key: lines
 #% type: integer
@@ -225,7 +202,44 @@
 #% guisection: Optional
 #%end
 
+#%option
+#% key: indexes
+#% type: integer
+#% description: Indexes of class probabilities to predict. Default -1 predicts all classes
+#% answer: -1
+#% guisection: Optional
+#% multiple: yes
+#%end
+
+#%option
+#% key: n_permutations
+#% type: integer
+#% description: Number of permutations to perform for feature importances
+#% answer: 50
+#% guisection: Optional
+#%end
+
+#%option
+#% key: n_jobs
+#% type: integer
+#% description: Number of cores for multiprocessing, -2 is n_cores-1
+#% answer: -2
+#% guisection: Optional
+#%end
+
 #%flag
+#% key: s
+#% label: Standardization preprocessing
+#% guisection: Optional
+#%end
+
+#%flag
+#% key: i
+#% label: Impute missing values in training data
+#% guisection: Optional
+#%end
+
+#%flag
 #% key: p
 #% label: Output class membership probabilities
 #% guisection: Optional
@@ -244,38 +258,47 @@
 #%end
 
 #%flag
+#% key: t
+#% description: Perform hyperparameter tuning only
+#% guisection: Optional
+#%end
+
+#%flag
 #% key: f
 #% description: Calculate feature importances using permutation
 #% guisection: Optional
 #%end
 
-#%option
-#% key: indexes
-#% type: integer
-#% description: Indexes of class probabilities to predict. Default -1 predicts all classes
-#% answer: -1
+#%flag
+#% key: b
+#% description: Balance training data using class weights
 #% guisection: Optional
-#% multiple: yes
 #%end
 
-#%option
-#% key: n_permutations
-#% type: integer
-#% description: Number of permutations to perform for feature importances
-#% answer: 10
+#%flag
+#% key: l
+#% label: Use memory swap
 #% guisection: Optional
 #%end
 
 #%option G_OPT_F_OUTPUT
+#% key: errors_file
+#% label: Save cross-validation global accuracy results to csv
+#% required: no
+#% guisection: Optional
+#%end
+
+#%option G_OPT_F_OUTPUT
 #% key: fimp_file
 #% label: Save feature importances to csv
 #% required: no
 #% guisection: Optional
 #%end
 
-#%flag
-#% key: b
-#% description: Balance training data using class weights
+#%option G_OPT_F_OUTPUT
+#% key: param_file
+#% label: Save hyperparameter search scores to csv
+#% required: no
 #% guisection: Optional
 #%end
 
@@ -307,12 +330,6 @@
 #% guisection: Optional
 #%end
 
-#%flag
-#% key: l
-#% label: Use memory swap
-#% guisection: Optional
-#%end
-
 #%rules
 #% exclusive: trainingmap,load_model
 #% exclusive: load_training,save_training
@@ -350,11 +367,18 @@
         from sklearn.preprocessing import OneHotEncoder
         from sklearn.pipeline import Pipeline
         from sklearn.utils import shuffle
+        from sklearn import metrics
+        from sklearn.metrics import make_scorer
         import warnings
         warnings.filterwarnings('ignore')  # turn off UndefinedMetricWarning
     except:
         grass.fatal("Scikit learn 0.18 or newer is not installed")
 
+    try:
+        import pandas as pd
+    except ImportError:
+        grass.fatal("Pandas is not installed")
+
     group = options['group']
     trainingmap = options['trainingmap']
     trainingpoints = options['trainingpoints']
@@ -371,6 +395,7 @@
     n_partitions = int(options['n_partitions'])
     modelonly = flags['m']
     probability = flags['p']
+    tuneonly = flags['t']
     rowincr = int(options['lines'])
     random_state = int(options['random_state'])
     model_save = options['save_model']
@@ -386,11 +411,13 @@
     if indexes == [-1]:
         indexes = None
     n_permutations = int(options['n_permutations'])
+    n_jobs = int(options['n_jobs'])
     lowmem = flags['l']
     impute = flags['i']
     prob_only = flags['z']
     errors_file = options['errors_file']
     fimp_file = options['fimp_file']
+    param_file = options['param_file']
     balance = flags['b']
     if balance is True:
         balance = 'balanced'
@@ -430,7 +457,7 @@
 
     # retrieve sklearn classifier object and parameters
     clf, mode = model_classifiers(
-        classifier, random_state, hyperparams, balance)
+        classifier, random_state, n_jobs, hyperparams, balance)
 
     # remove dict keys that are incompatible for the selected classifier
     clf_params = clf.get_params()
@@ -440,9 +467,11 @@
 
     # scoring metrics
     if mode == 'classification':
-        scoring = ['accuracy', 'precision', 'recall', 'f1', 'kappa', 'balanced_accuracy']
+        scoring = ['matthews_corrcoef', 'accuracy', 'precision', 'recall', 'f1', 'kappa', 'balanced_accuracy']
+        search_scorer = make_scorer(metrics.cohen_kappa_score)
     else:
         scoring = ['r2', 'neg_mean_squared_error']
+        search_scorer = 'r2'
 
     # Sample training data and group ids
     # ----------------------------------
@@ -501,7 +530,7 @@
                 if cvtype == 'kmeans':
                     clusters = KMeans(
                         n_clusters=n_partitions,
-                        random_state=random_state, n_jobs=-1)
+                        random_state=random_state, n_jobs=n_jobs)
 
                     clusters.fit(sample_coords)
                     group_id = clusters.labels_
@@ -592,8 +621,8 @@
 
             # create grid search method
             clf = GridSearchCV(
-                estimator=clf, param_grid=param_grid, scoring=scoring[0],
-                n_jobs=-1, cv=resampling)
+                estimator=clf, param_grid=param_grid, scoring=search_scorer,
+                n_jobs=n_jobs, cv=resampling)
 
         # classifier training
         # -------------------
@@ -620,12 +649,15 @@
             grass.message(os.linesep)
             grass.message('Best parameters:')
             grass.message(str(clf.best_params_))
+            if param_file != '':
+                param_df = pd.DataFrame(clf.cv_results_)
+                param_df.to_csv(param_file)
 
         # cross-validation
         # -----------------
 
         # If cv > 1 then use cross-validation to generate performance measures
-        if cv > 1:
+        if cv > 1 and tuneonly is not True:
             if mode == 'classification' and cv > np.histogram(
                     y, bins=len(np.unique(y)))[0].min():
                 grass.message(os.linesep)
@@ -668,14 +700,8 @@
 
                 # write cross-validation results for csv file
                 if errors_file != '':
-                    try:
-                        import pandas as pd
-                        errors = pd.DataFrame(scores)
-                        errors.to_csv(errors_file, mode='w')
-                    except:
-                        grass.warning('Pandas is not installed. Pandas is '
-                                      'required to write the cross-validation '
-                                      'results to file')
+                    errors = pd.DataFrame(scores)
+                    errors.to_csv(errors_file, mode='w')
 
                 # feature importances
                 if importances is True:

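Two further changes in r.learn.ml.py above are easy to miss: classification grid searches now rank candidates by Cohen's kappa via make_scorer (previously scoring[0], i.e. accuracy), and the hard-coded n_jobs=-1 gives way to the new n_jobs option. A hedged sketch of that wiring, using an illustrative dataset and estimator:

    from sklearn import metrics
    from sklearn.datasets import load_iris
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # make_scorer turns a (y_true, y_pred) metric into the
    # scorer(estimator, X, y) callable that GridSearchCV expects
    search_scorer = make_scorer(metrics.cohen_kappa_score)

    clf = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=1),
        param_grid={'max_depth': [2, 4, 8]},
        scoring=search_scorer,
        n_jobs=-2,  # joblib convention: -2 leaves one core free
        cv=3)
    clf.fit(X, y)
    print(clf.best_params_)  # best candidate ranked by kappa

Kappa corrects accuracy for chance agreement, which is a safer search criterion when class frequencies are unbalanced.
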
Modified: grass-addons/grass7/raster/r.learn.ml/raster_learning.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/raster_learning.py	2017-04-19 20:10:53 UTC (rev 70900)
+++ grass-addons/grass7/raster/r.learn.ml/raster_learning.py	2017-04-19 20:59:52 UTC (rev 70901)
@@ -2,6 +2,7 @@
 import numpy as np
 from numpy.random import RandomState
 import tempfile
+import itertools
 from copy import deepcopy
 import grass.script as grass
 from grass.pygrass.raster import RasterRow
@@ -237,7 +238,7 @@
             # metrics that have no averaging for multiclass
             elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
             or m == 'hamming_loss' or m == 'jaccard_similarity' \
-            or m == 'log_loss' or m == 'zero_one_loss':
+            or m == 'log_loss' or m == 'zero_one_loss' or m == 'matthews_corrcoef':
                 scores[m] = np.append(
                     scores[m], scoring_methods[m](y_test, y_pred))
 
@@ -412,7 +413,7 @@
             pass
 
 
-def model_classifiers(estimator, random_state, p, weights=None):
+def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
 
     """
     Provides the classifiers and parameters used by the module
@@ -421,6 +422,7 @@
     ----
     estimator: Name of estimator
     random_state: Seed to use in randomized components
+    n_jobs: Integer, number of processing cores to use
     p: Dict, containing classifier settings
     weights: None, or 'balanced' to add class_weights
 
@@ -451,7 +453,7 @@
 
             earth_classifier = Pipeline([('Earth',
                                           Earth(max_degree=p['max_degree'])),
-                                         ('Logistic', LogisticRegression())])
+                                         ('Logistic', LogisticRegression(n_jobs=n_jobs))])
 
             classifiers = {'EarthClassifier': earth_classifier,
                            'EarthRegressor': Earth(max_degree=p['max_degree'])}
@@ -470,12 +472,14 @@
                     XGBClassifier(learning_rate=p['learning_rate'],
                                   n_estimators=p['n_estimators'],
                                   max_depth=p['max_depth'],
-                                  subsample=p['subsample']),
+                                  subsample=p['subsample'],
+                                  nthread=n_jobs),
                 'XGBRegressor':
                     XGBRegressor(learning_rate=p['learning_rate'],
                                  n_estimators=p['n_estimators'],
                                  max_depth=p['max_depth'],
-                                 subsample=p['subsample'])}
+                                 subsample=p['subsample'],
+                                 nthread=n_jobs)}
         except:
             grass.fatal('XGBoost package not installed')
     else:
@@ -489,7 +493,7 @@
                 LogisticRegression(C=p['C'],
                                    class_weight=weights,
                                    random_state=random_state,
-                                   n_jobs=-1,
+                                   n_jobs=n_jobs,
                                    fit_intercept=True),
             'DecisionTreeClassifier':
                 DecisionTreeClassifier(max_depth=p['max_depth'],
@@ -510,7 +514,7 @@
                                        min_samples_leaf=p['min_samples_leaf'],
                                        class_weight=weights,
                                        random_state=random_state,
-                                       n_jobs=-1,
+                                       n_jobs=n_jobs,
                                        oob_score=False),
             'RandomForestRegressor':
                 RandomForestRegressor(n_estimators=p['n_estimators'],
@@ -518,7 +522,7 @@
                                       min_samples_split=p['min_samples_split'],
                                       min_samples_leaf=p['min_samples_leaf'],
                                       random_state=random_state,
-                                      n_jobs=-1,
+                                      n_jobs=n_jobs,
                                       oob_score=False),
             'ExtraTreesClassifier':
                 ExtraTreesClassifier(n_estimators=p['n_estimators'],
@@ -527,7 +531,7 @@
                                      min_samples_leaf=p['min_samples_leaf'],
                                      class_weight=weights,
                                      random_state=random_state,
-                                     n_jobs=-1,
+                                     n_jobs=n_jobs,
                                      oob_score=False),
             'ExtraTreesRegressor':
                 ExtraTreesRegressor(n_estimators=p['n_estimators'],
@@ -535,7 +539,7 @@
                                     min_samples_split=p['min_samples_split'],
                                     min_samples_leaf=p['min_samples_leaf'],
                                     random_state=random_state,
-                                    n_jobs=-1,
+                                    n_jobs=n_jobs,
                                     oob_score=False),
 
             'GradientBoostingClassifier':



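For reference, a cut-down sketch of the model_classifiers signature change in raster_learning.py; only two estimators are shown, the parameter dict is reduced to match, and the real function also returns the classification/regression mode:

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression

    def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
        # n_jobs replaces the previously hard-coded -1, so the new
        # module option (default -2, i.e. all cores but one) controls
        # the parallel components instead of always taking all cores
        classifiers = {
            'LogisticRegression': LogisticRegression(
                C=p['C'], class_weight=weights,
                random_state=random_state, n_jobs=n_jobs,
                fit_intercept=True),
            'RandomForestClassifier': RandomForestClassifier(
                n_estimators=p['n_estimators'],
                class_weight=weights,
                random_state=random_state, n_jobs=n_jobs)}
        return classifiers[estimator]

    clf = model_classifiers('RandomForestClassifier', random_state=1,
                            n_jobs=-2, p={'n_estimators': 100, 'C': 1.0})

The same value also reaches KMeans (for the spatial cross-validation partitions) and XGBoost, which takes it as nthread rather than n_jobs.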