[GRASS-SVN] r69984 - grass-addons/grass7/raster/r.randomforest

Sat Dec 3 09:41:29 PST 2016

Author: spawley
Date: 2016-12-03 09:41:29 -0800 (Sat, 03 Dec 2016)
New Revision: 69984

Modified:
   grass-addons/grass7/raster/r.randomforest/ml_utils.py
   grass-addons/grass7/raster/r.randomforest/r.randomforest.html
   grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
bug fix to r.randomforest

Modified: grass-addons/grass7/raster/r.randomforest/ml_utils.py
===================================================================

--- grass-addons/grass7/raster/r.randomforest/ml_utils.py	2016-12-03 04:41:46 UTC (rev 69983)
+++ grass-addons/grass7/raster/r.randomforest/ml_utils.py	2016-12-03 17:41:29 UTC (rev 69984)
@@ -1,6 +1,7 @@
 import numpy as np
 import grass.script as grass
 import tempfile
+import copy
 from grass.pygrass.raster import RasterRow
 from grass.pygrass.gis.region import Region
 from grass.pygrass.raster.buffer import Buffer
@@ -9,7 +10,6 @@
 from sklearn.model_selection import GroupKFold
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import GridSearchCV
-from sklearn import preprocessing
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import f_classif
 from sklearn.utils import shuffle
@@ -387,8 +387,11 @@
         y_pred_agg = np.append(y_pred_agg, y_pred)
 
         # calculate metrics
-        scores['accuracy'] = np.append(
-            scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
+        try:
+            scores['accuracy'] = np.append(
+                scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
+        except:
+            pass
 
         scores['r2'] = np.append(
             scores['r2'], metrics.r2_score(y_test, y_pred))
@@ -406,7 +409,7 @@
     return(scores, y_test_agg, y_pred_agg)
 
 
-def tune_split(X, y, Id, estimator, params, test_size, random_state):
+def tune_split(X, y, Id, estimator, metric, params, test_size, random_state):
 
     if Id is None:
         X, X_devel, y, y_devel = train_test_split(X, y, test_size=test_size,
@@ -417,7 +420,7 @@
                             random_state=random_state, stratify=Id)
 
     clf = GridSearchCV(estimator=estimator, cv=3, param_grid=params,
-                              scoring="accuracy", n_jobs=-1)
+                              scoring=metric, n_jobs=-1)
     
     clf.fit(X_devel, y_devel)
 
@@ -426,25 +429,17 @@
 
 def feature_importances(clf, X, y):
 
-    min_max_scaler = preprocessing.MinMaxScaler()
-
     try:
-        clfimp = min_max_scaler.fit_transform(
-                    clf.feature_importances_.reshape(-1, 1))
+        clfimp = clf.feature_importances_
     except:
-        try:
-            clfimp = min_max_scaler.fit_transform(
-                        abs(clf.coef_.T).reshape(-1, 1))
-        except:
-            sk = SelectKBest(f_classif, k='all')
-            sk_fit = sk.fit(X, y)
-            clfimp = min_max_scaler.fit_transform(
-                        sk_fit.scores_.reshape(-1, 1))
+        sk = SelectKBest(f_classif, k='all')
+        sk_fit = sk.fit(X, y)
+        clfimp = sk_fit.scores_
 
     return (clfimp)
 
 
-def sample_training_data(roi, maplist, cv, cvtype, model_load, model_save,
+def sample_training_data(roi, maplist, cv, cvtype, model_load,
                          load_training, save_training, lowmem, random_state):
     
     # load the model or training data
@@ -460,17 +455,18 @@
             # create clumped roi for spatial cross validation
             if cv > 1 and cvtype == 'clumped':
                 r.clump(input=roi, output='tmp_roi_clumped', overwrite=True, quiet=True)
-                maplist2 = maplist
+                maplist2 = copy.deepcopy(maplist)
                 maplist2.append('tmp_roi_clumped')
                 X, y, sample_coords = sample_predictors(response=roi,
                                                         predictors=maplist2,
                                                         shuffle_data=False,
-                                                        lowmem=lowmem)
+                                                        lowmem=lowmem,
+                                                        random_state=random_state)
                  # take Id from last column
-                Id = X[:, X.shape[1]-1]
+                Id = X[:, -1]
 
                 # remove Id column from predictors
-                X = X[:, 0:X.shape[1]]
+                X = X[:, 0:X.shape[1]-1]
             else:
                 # query predictor rasters with training features
                 Id = None
@@ -487,8 +483,6 @@
 
             if save_training != '':
                 save_training_data(X, y, Id, save_training)
-                
-            if model_save != '':
-                save_training_data(X, y, Id, model_save + ".csv")
 
+    
     return (X, y, Id, clf)

Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.html
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.html	2016-12-03 04:41:46 UTC (rev 69983)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.html	2016-12-03 17:41:29 UTC (rev 69984)
@@ -9,7 +9,7 @@
 The Classifier parameters tab provides access to the most pertinent parameters that affect the previously described algorithms. <i>C</i> is the inverse of the regularization strength, which is when a penalty is applied to avoid overfitting. <i>C</i> applies to the LogisticRegression and SVC models. Most of the other parameters apply to the tree and ensemble-tree based classifiers. <i>n_estimators</i> represents the number of trees in Random Forest model, and the number of trees used in each model step during Gradient Boosting. <i>max_features</i> controls the number of variables that are allowed to be chosen from at each node split in the tree-based models, and can be considered to control the degree of correlation between the trees in ensemble tree methods. <i>min_samples_split</i> and <i>min_samples_leaf</i> control the number of samples required to split a node, or form a leaf node, respectively. The <i>learning_rate</i> and <i>subsample</i> parameters apply only to Gradient Boosting. <i>learning_rate</i> shrinks the contribution of each tree, and <i>subsample</i> is the fraction of randomly selected samples for each tree, and values of &lt 1 reduce the model variance resulting in Stochastic Gradient Boosting. 
 
 <p>
-In addition to model fitting and prediction, <em><b>r.randomforest</b></em> can be used for feature selection using the <i>f</i> flag. The linear model classifiers (LogisticRegression, SVC) provide fit coefficients which can be used to evaluate the importance of each of the predictors. Furthermore, the tree-based classifiers include an intrisic measure of variable importance based on the relative rank (depth) of a feature used as a decision node in a tree. If the classifier provides neither of these methods as is the case with LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis and GaussianNB, then univariate feature selection is used to provide feature importance scores. 
+In addition to model fitting and prediction, <em><b>r.randomforest</b></em> can be used for feature selection using the <i>f</i> flag. The tree-based classifiers include an intrinsic measure of variable importance based on the relative rank (depth) of a feature used as a decision node in a tree. For other classifiers, univariate feature selection is used to provide feature importance scores. 
 
 <p>
 Cross validation can be performed by setting the <i>cv</i> parameters to > 1. Cross-validation is performed using stratified kfolds, and multiple global and per-class accuracy measures are produced. Also note that this cross-validation is performed on a pixel basis. If there is a strong autocorrelation between pixels (i.e. the pixels represent polygons) then the training/test splits will not represent independent samples and will overestimate the accuracy. In this case, the <i>cvtype</i> parameter can be changed from 'non-spatial' to either 'clumped' or 'kmeans' to perform spatial cross-validation. Clumped spatial cross-validation is used if the training pixels represent polygons, and then cross-validation will be effectively performed on a polygon basis. Kmeans spatial cross-validation will partition the training pixels into groups by kmeans clustering of the pixel coordinates. These partitions will then be used for cross-validation, which should provide more realistic performance measures if the data are spatially correlated.

Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-12-03 04:41:46 UTC (rev 69983)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-12-03 17:41:29 UTC (rev 69984)
@@ -338,6 +338,12 @@
         max_features = str('auto')
     if max_depth == -1:
         max_depth = None
+
+    if (model == 'LinearDiscriminantAnalysis' or
+    model == 'QuadraticDiscriminantAnalysis' or
+    model == 'GaussianNB'):
+        grass.warning('No parameters to tune for selected model...ignoring')
+        tuning = False
         
     """
     Obtain information about GRASS rasters to be classified
@@ -365,7 +371,7 @@
 
     # load or sample training data
     X, y, Id, clf = sample_training_data(roi, maplist, cv, cvtype, model_load,
-                                         model_save, load_training,
+                                         load_training,
                                          save_training, lowmem, random_state)
 
     # determine the number of class labels using np.unique
@@ -386,21 +392,32 @@
     --------------------
     """
 
+    grass.message("Model=" + model)
+    clf, param_grid, mode =\
+        model_classifiers(model, random_state,
+                          class_weight, C, max_depth,
+                          max_features, min_samples_split,
+                          min_samples_leaf, n_estimators,
+                          subsample, learning_rate)
+
+    # check for classification or regression mode
+    if mode == 'regression' and probability is True:
+        grass.warning('Class probabilities only possible for classifications...ignoring')
+        probability = False
+
     # define classifier unless model is to be loaded from file
     if model_load == '':
 
-        grass.message("Model=" + model)
-        clf, param_grid, mode =\
-            model_classifiers(model, random_state,
-                              class_weight, C, max_depth,
-                              max_features, min_samples_split,
-                              min_samples_leaf, n_estimators,
-                              subsample, learning_rate)
+        # data splitting for automatic parameter tuning
+        if tuning is True:                
 
-        # data splitting for automatic parameter tuning
-        if tuning is True:
+            if mode == 'classification':
+                metric = 'accuracy'
+            else:
+                metric = 'r2'
+            
             X, X_devel, y, y_devel, Id, Id_devel, clf = \
-                tune_split(X, y, Id, clf, param_grid, ratio, random_state)
+                tune_split(X, y, Id, clf, metric, param_grid, ratio, random_state)
 
             grass.message('\n')
             grass.message('Searched parameters:')
@@ -478,9 +495,8 @@
 
             # output to GRASS message
             grass.message("\r\n")
-            grass.message("Normalized feature importances")
+            grass.message("Feature importances")
             grass.message("id" + "\t" + "Raster" + "\t" + "Importance")
-
             for i in range(len(clfimp)):
                 grass.message(
                     str(i) + "\t" + maplist[i] +
@@ -488,7 +504,7 @@
 
             if fimp_file != '':
                 fimp_output = pd.DataFrame(
-                    {'grass raster': maplist, 'importance': clfimp[:, 0]})
+                    {'grass raster': maplist, 'importance': clfimp})
                 fimp_output.to_csv(
                     path_or_buf=fimp_file,
                     header=['grass raster', 'importance'])
@@ -499,8 +515,12 @@
         """
 
         if model_save != '':
-            joblib.dump(clf, model_save + ".pkl")
+            joblib.dump(clf, model_save)
 
+            save_training_data(
+                X, y, Id, model_save.replace(".pkl", ".csv"))
+
+
         if modelonly is True:
             grass.fatal("Model built and now exiting")
 
@@ -508,7 +528,6 @@
     Prediction on the rest of the GRASS rasters in the imagery group
     ----------------------------------------------------------------
     """
-
     prediction(clf, labels, maplist, scaler, probability,
                rowincr, output, mode)