[GRASS-SVN] r69970 - grass-addons/grass7/raster/r.randomforest

svn_grass at osgeo.org svn_grass at osgeo.org
Thu Dec 1 20:43:01 PST 2016


Author: spawley
Date: 2016-12-01 20:43:01 -0800 (Thu, 01 Dec 2016)
New Revision: 69970

Modified:
   grass-addons/grass7/raster/r.randomforest/Makefile
   grass-addons/grass7/raster/r.randomforest/ml_utils.py
   grass-addons/grass7/raster/r.randomforest/r.randomforest.html
   grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
Update manual and bug fixes to r.randomforest

Modified: grass-addons/grass7/raster/r.randomforest/Makefile
===================================================================
--- grass-addons/grass7/raster/r.randomforest/Makefile	2016-12-02 00:43:06 UTC (rev 69969)
+++ grass-addons/grass7/raster/r.randomforest/Makefile	2016-12-02 04:43:01 UTC (rev 69970)
@@ -5,6 +5,5 @@
 ETCFILES = ml_classifiers ml_utils
 
 include $(MODULE_TOPDIR)/include/Make/Script.make
-include $(MODULE_TOPDIR)/include/Make/Python.make
 
 default: script

Modified: grass-addons/grass7/raster/r.randomforest/ml_utils.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/ml_utils.py	2016-12-02 00:43:06 UTC (rev 69969)
+++ grass-addons/grass7/raster/r.randomforest/ml_utils.py	2016-12-02 04:43:01 UTC (rev 69970)
@@ -19,7 +19,6 @@
 from sklearn.cluster import KMeans
 
 
-
 def save_training_data(X, y, groups, file):
 
     """
@@ -83,7 +82,7 @@
     return(X, y, groups)
 
 
-def sample_predictors(response, predictors, shuffle_data=True, random_state=1):
+def sample_predictors(response, predictors, shuffle_data=True, lowmem=False, random_state=1):
 
     """
     Samples a list of GRASS rasters using a labelled raster
@@ -102,12 +101,20 @@
     y_indexes: Row and Columns of label positions
 
     """
-
+    current = Region()
+    
     # open response raster as rasterrow and read as np array
     if RasterRow(response).exist() is True:
         roi_gr = RasterRow(response)
         roi_gr.open('r')
-        response_np = np.array(roi_gr)
+        
+        if lowmem is False:        
+            response_np = np.array(roi_gr)
+        else:
+            response_np = np.memmap(grass.tempfile(create=False),
+                                    dtype='float32', mode='w+',
+                                    shape=(current.rows, current.cols))
+            response_np[:] = np.array(roi_gr)
     else:
         grass.fatal("GRASS response raster does not exist.... exiting")
 
@@ -130,10 +137,21 @@
     training_data = np.zeros((n_labels, n_features))
 
     # Loop through each raster and sample pixel values at training indexes
+    if lowmem is True:
+        tmp = grass.tempfile(create=False)
+
     for f in range(n_features):
         predictor_gr = RasterRow(predictors[f])
         predictor_gr.open('r')
-        feature_np = np.array(predictor_gr)
+        
+        if lowmem is False:
+            feature_np = np.array(predictor_gr)
+        else:
+            feature_np = np.memmap(tmp, dtype='float32', mode='w+',
+                                   shape=(current.rows, current.cols))    
+            
+            feature_np[:] = np.array(predictor_gr)
+
         training_data[0:n_labels, f] = feature_np[is_train]
         predictor_gr.close()
 
@@ -423,7 +441,8 @@
     return (clfimp)
 
 
-def sample_training_data(roi, maplist, cv, cvtype, model_load, model_save, load_training, save_training, random_state):
+def sample_training_data(roi, maplist, cv, cvtype, model_load, model_save,
+                         load_training, save_training, lowmem, random_state):
     
     # load the model or training data
     if model_load != '':
@@ -442,7 +461,8 @@
                 maplist2.append('tmp_roi_clumped')
                 X, y, sample_coords = sample_predictors(response=roi,
                                                         predictors=maplist2,
-                                                        shuffle_data=False)
+                                                        shuffle_data=False,
+                                                        lowmem=lowmem)
                  # take Id from last column
                 Id = X[:, X.shape[1]-1]
 
@@ -452,7 +472,8 @@
                 # query predictor rasters with training features
                 Id = None
                 X, y, sample_coords = sample_predictors(
-                    roi, maplist, shuffle_data=True, random_state=random_state)
+                    response=roi, predictors=maplist, shuffle_data=True,
+                    lowmem=lowmem, random_state=random_state)
 
             if save_training != '':
                 save_training_data(X, y, Id, save_training)

Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.html
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.html	2016-12-02 00:43:06 UTC (rev 69969)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.html	2016-12-02 04:43:01 UTC (rev 69970)
@@ -1,35 +1,26 @@
 <h2>DESCRIPTION</h2>
 
-<em><b>r.randomforest</b></em> represents a front-end to the scikit learn machine learning python package for the purpose of performing classification and regression on a suite of predictors within a GRASS imagery group. The module also provides access random forest classification, and several other classifiers that are commonly used in remote sensing and spatial modelling. For more information concerning the details of any of the algorithms, consult the scikit-learn documentation directly. The choice of classifier is set using the <i>model</i> parameter.
+<em><b>r.randomforest</b></em> represents a front-end to the scikit-learn Python package for the purpose of performing classification and regression on GRASS rasters as part of an imagery group. The module enables classification and regression using random forests and several other classifiers that are commonly used in remote sensing and spatial modelling. The choice of classifier is set using the <i>model</i> parameter. The following classification and regression methods are available. For more details relating to the classifiers, refer to the <a href="http://scikit-learn.org/stable/">scikit-learn documentation</a>.
 
 <p>
-The RandomForestsClassifier and RandomForestsRegressor (Breiman, 2001) options represent ensemble classification and regression tree methods, respectively. These methods construct a forest of uncorrelated decision trees based on a random subset of predictor variables, which occurs independently at every node split in each tree. Each tree produces a prediction probability, and the final classification result is obtained by averaging of the prediction probabilities across all of the trees. Random forests require relatively few user-specified parameter choices, principally consisting of the number of trees in the forest (<i>ntrees_rf</i>), and the number of variables that are allowed to be chosen from at each node split (<i>m_features_rf</i>), which controls the degree of correlation between the trees. Random forests also includes built-in accuracy assessment, termed the 'out-of-bag' (OOB) error. This is computed through bagging, where 33% of the training data are held-out during the construction of each tree, and then OOB data are used to evaluate the prediction accuracy.
+<em><b>LogisticRegression</b></em> represents a linear model for classification rather than regression. Logistic regression is a modification of linear regression that uses the logistic function, which enables the use of a categorical response variable. If the response raster (roi) is coded to 0 and 1, then a binary classification occurs, but the scikit-learn logistic regression can also perform a multiclass classification using a one-versus-rest scheme. <em><b>LinearDiscriminantAnalysis</b></em> and <em><b>QuadraticDiscriminantAnalysis</b></em> are classifiers with linear and quadratic decision surfaces. These classifiers do not take any parameters and are inherently multiclass. They can only be used for classification. Linear discriminant analysis can only separate groups using a linear decision boundary, while quadratic discriminant analysis can learn quadratic boundaries and is therefore more flexible. <em><b>GaussianNB</b></em> is the Gaussian Naive Bayes algorithm and can be used for classification only. Naive Bayes is a supervised learning algorithm based on applying Bayes' theorem with the naive assumption of independence between every pair of features. This classifier does not take any parameters. The Naive Bayes classifier is very fast and can be applied to high-dimensional data because each predictor is assessed independently. However, the assumption of independence between predictors may not be appropriate for many datasets. The <em><b>DecisionTreeClassifier</b></em> and <em><b>DecisionTreeRegressor</b></em> models represent non-parametric supervised learning methods used for classification and regression. Decision tree classifiers map observations to a response variable using a hierarchy of splits and branches. The termini of these branches, termed leaves, represent the prediction of the response variable. Decision trees are non-parametric, can model non-linear relationships between a response and predictor variables, and are insensitive to the scaling of the predictors. Furthermore, the resulting models represent an intuitive structure where relationships between the response and predictors are easily visualized. The <em><b>RandomForestClassifier</b></em> and <em><b>RandomForestRegressor</b></em> (Breiman, 2001) models represent ensemble classification and regression tree methods. A disadvantage of single decision trees is that they tend to overfit and therefore are weak predictors. Random forests overcome some of these disadvantages by constructing an ensemble of uncorrelated decision trees. The trees are forced to be uncorrelated because only a random subset of predictor variables (represented by the rasters in the imagery group) is available during each node split in the tree. Each tree produces a prediction probability and the final classification result is obtained by averaging the prediction probabilities across all of the trees. The <em><b>GradientBoostingClassifier</b></em> and <em><b>GradientBoostingRegressor</b></em> also represent ensemble tree-based models. However, in a boosted model the learning process is additive in a forward step-wise fashion, where <i>n_estimators</i> are fit during each model step and each model step is designed to better fit samples that are not currently well predicted by the previous step. This incrementally improves the performance of the entire model ensemble by fitting to the model residuals. The <em><b>SVC</b></em> model is C-Support Vector Classification. Only a linear kernel is supported because non-linear kernels in scikit-learn are too slow to be practical for typical remote sensing and spatial analysis datasets, which consist of large numbers of samples.
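<p>
For illustration, the following minimal sketch shows the kind of scikit-learn calls that such a classification is built on (the arrays <i>X</i> and <i>y</i> are random placeholders and are not produced by the module):
<div class="code"><pre>
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# placeholder training data: 100 samples, 4 predictors, 3 classes
X = np.random.rand(100, 4)
y = np.random.randint(0, 3, 100)

clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X, y)
proba = clf.predict_proba(X)   # per-class probabilities averaged over trees
pred = clf.predict(X)          # hard class labels
</pre></div>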
 
 <p>
-LogisticRegression, despite its name, represents a linear model for classification rather than regression. This module provides access to two parameters, <i>C_lr</i> the inverse of the regularization strength, and <i>i</i> which specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.
+The Classifier Parameters tab provides access to the most pertinent parameters that affect the previously described algorithms. <i>C</i> is the inverse of the regularization strength; regularization applies a penalty to avoid overfitting. <i>C</i> applies to the LogisticRegression and SVC models. Most of the other parameters apply to the tree and ensemble-tree based classifiers. <i>n_estimators</i> represents the number of trees in the Random Forest model, and the number of trees used in each model step during Gradient Boosting. <i>max_features</i> controls the number of variables that can be chosen from at each node split in the tree-based models, and can be considered to control the degree of correlation between the trees in ensemble tree methods. <i>min_samples_split</i> and <i>min_samples_leaf</i> control the number of samples required to split a node or form a leaf node, respectively. The <i>learning_rate</i> and <i>subsample</i> parameters apply only to Gradient Boosting. <i>learning_rate</i> shrinks the contribution of each tree, and <i>subsample</i> is the fraction of randomly selected samples for each tree; values of &lt; 1 reduce the model variance, resulting in Stochastic Gradient Boosting.
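<p>
As a rough sketch of how these parameters map onto a scikit-learn estimator (the values are illustrative only, not recommended defaults):
<div class="code"><pre>
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(
    n_estimators=100,       # trees fitted at each model step
    learning_rate=0.05,     # shrinks the contribution of each tree
    subsample=0.75,         # fractions below 1 give Stochastic Gradient Boosting
    max_features='sqrt',    # predictors considered at each node split
    min_samples_split=2,
    min_samples_leaf=1)
</pre></div>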
 
 <p>
-LinearDiscriminantAnalysis and QuadraticDiscriminantAnalysis are two classifiers with a linear and a quadratic decision surface, respectively. These classifiers do not take any parameters.
+In addition to model fitting and prediction, <em><b>r.randomforest</b></em> can be used for feature selection using the <i>f</i> flag. The linear model classifiers (LogisticRegression, SVC) provide fit coefficients which can be used to evaluate the importance of each of the predictors. Furthermore, the tree-based classifiers include an intrinsic measure of variable importance based on the relative rank (depth) of a feature used as a decision node in a tree. If the classifier provides neither of these methods, as is the case with LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis and GaussianNB, then univariate feature selection is used to provide feature importance scores.
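<p>
A minimal sketch of how these three kinds of importance scores can be obtained from fitted scikit-learn estimators (placeholder data; not the module's internal code):
<div class="code"><pre>
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif

X = np.random.rand(100, 4)
y = np.random.randint(0, 2, 100)

rf = RandomForestClassifier(n_estimators=100).fit(X, y)
print(rf.feature_importances_)    # intrinsic tree-based importances

lr = LogisticRegression().fit(X, y)
print(lr.coef_)                   # linear model fit coefficients

print(SelectKBest(f_classif, k='all').fit(X, y).scores_)   # univariate scores
</pre></div>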
 
 <p>
-GaussianNB implements the Gaussian Naive Bayes algorithm for classification. Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem with the “naive” assumption of independence between every pair of features. This classifier does not take any parameters.
+Cross validation can be performed by setting the <i>cv</i> parameter to > 1. Cross-validation is performed using stratified kfolds, and multiple global and per-class accuracy measures are produced. Also note that this cross-validation is performed on a pixel basis. If there is a strong autocorrelation between pixels (i.e. the pixels represent polygons) then the training/test splits will not represent independent samples and will overestimate the accuracy. In this case, the <i>cvtype</i> parameter can be changed from 'non-spatial' to either 'clumped' or 'kmeans' to perform spatial cross-validation. Clumped spatial cross-validation is used if the training pixels represent polygons, and then cross-validation will effectively be performed on a polygon basis. Kmeans spatial cross-validation will partition the training pixels into groups by kmeans clustering of the pixel coordinates. These partitions will then be used for cross-validation, which should provide more realistic performance measures if the data are spatially correlated.
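<p>
The general idea of kmeans spatial cross-validation can be sketched with scikit-learn as follows (placeholder data; the module's own implementation may differ):
<div class="code"><pre>
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, cross_val_score

# placeholder pixel coordinates, predictors and labels
coords = np.random.rand(200, 2) * 1000
X = np.random.rand(200, 5)
y = np.random.randint(0, 3, 200)

# cluster the coordinates and use the clusters as cross-validation groups
groups = KMeans(n_clusters=5, random_state=1).fit_predict(coords)
scores = cross_val_score(RandomForestClassifier(n_estimators=100),
                         X, y, groups=groups, cv=GroupKFold(n_splits=5))
print(scores.mean())
</pre></div>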
 
 <p>
-The DecisionTreeClassifier and DecisionTreeRegressor represent non-parametric supervised learning methods used for classification and regression, respectively. Several parameter choices are available, relating to the node splitting method, the number of features to consider at each split, and the minimum number of samples in a split or leaf node.
+Most machine learning algorithms do not perform well in the case of a large class imbalance. In this case, the classifier will seek to reduce the overall model error, but this will occur by predicting the majority class with very high accuracy at the expense of the minority class. If you have a highly imbalanced dataset, the 'balanced' <i>b</i> flag can be set. The scikit-learn 'balanced' mode then automatically adjusts weights inversely proportional to the class frequencies. This only applies to the LogisticRegression, DecisionTree, RandomForest, and GradientBoosting classifiers.
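<p>
A brief sketch, assuming the <i>b</i> flag corresponds to the scikit-learn <i>class_weight='balanced'</i> option described above:
<div class="code"><pre>
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# weights are adjusted inversely proportional to the class frequencies
clf = LogisticRegression(class_weight='balanced')
clf = RandomForestClassifier(n_estimators=500, class_weight='balanced')
</pre></div>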
 
 <p>
-The GradientBoostingClassifier and GradientBoostingRegressor use an ensemble of boosted decision trees for classification and regression, respectively. Gradient tree boosting produces a prediction model in the form of an ensemble of weak prediction models from decision trees, but the model is built in a stage-wise fashion. Gradient tree boosting includes many parameter choices, although the module provides access to the most common parameters that may require tuning for optimal performance.
+Although tree-based classifiers are insensitive to the scaling of the input data, other classifiers such as <b>LogisticRegression</b> and <b>SVC</b> may not perform optimally if some predictors have variances that are orders of magnitude larger than others, and will therefore dominate the objective function. The <i>s</i> flag can be used to add a standardization preprocessing step to the classification and prediction, which will standardize each predictor relative to its standard deviation. 
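<p>
A minimal sketch of standardization as a preprocessing step in scikit-learn (placeholder data; not the module's internal code):
<div class="code"><pre>
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# placeholder predictors with very different scales
X = np.random.rand(100, 4) * [1, 10, 100, 1000]
y = np.random.randint(0, 2, 100)

# standardize each predictor before fitting the classifier
clf = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1.0))
clf.fit(X, y)
</pre></div>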
 
 <p>
-The tree-based classifiers include a measure of variable importance based on the Gini impurity criterion, which measures how each variable contributes to the homogeneity of the nodes, with important variables causing a larger decrease in the Gini coefficient in successive node splits. This variable importance allows the contributions of the individual predictors to be determined. The feature importance scores are displayed in the command output.
-
-<p>
-Cross validation can be performed by setting the <i>cv</i> parameters to > 1. Cross-validation is performed using stratified kfolds, and multiple global and per-class accuracy measures are produced, consisting of accuracy, kappa, precision, recall, f1 measure, and if the response variable is binary (0,1), area under the receiver operating curve (auc). Note in a multiclass classification, the global precision, recall and f1 measures represent a weighted average of the per-class metrics (weighted by number of samples per class). Also note that this cross-validation is performed on a pixel basis. If there is a strong autocorrelation between pixels (i.e. the pixels represent polygons) then the training/test splits will not represent independent samples and will overestimate the accuracy. In this case you should train/split manually and use <i>i.kappa</i>.
-
-<p>
-Most machine learning algorithms do not perform well in the case of a large class imbalance. In this case, the classifier will seek to reduce the overall model error, but this will occur by predicting the majority class with a very high accuracy, but at the expense of the minority class. If you have a highly imbalanced dataset, the 'balanced'  <i>b</i> flag can be set. The scikit-learn implementation balanced mode then automatically adjust weights inversely proportional to class frequencies. This only applies to the LogisticRegression, DecisionTree, RandomForest, and GradientBoostingClassifiers.
-
-<p>
 The module also offers the ability to save and load a classification or regression model. The model is saved as a list of filenames (starting with the extension .pkl, which is added automatically) for each numpy array. This list can involve a large number of files, so it makes sense to save each model in a separate directory. To load the model, you need to select the .pkl file that was saved. Saving and loading a model represents a useful feature because it allows a model to be built on one imagery group (i.e. set of predictor variables), and then the prediction can be performed on other imagery groups. This approach is commonly employed in species prediction modelling, or landslide susceptibility modelling, where a classification or regression model is built with one set of predictors (e.g. which include present-day climatic variables) and then predictions can be performed on other imagery groups containing forecasted climatic variables.
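<p>
A minimal sketch of saving and loading a fitted model with joblib, which scikit-learn uses for model persistence (the path is a placeholder; older joblib versions write companion .npy files for the numpy arrays alongside the .pkl file):
<div class="code"><pre>
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

clf = RandomForestClassifier(n_estimators=100)
clf.fit(np.random.rand(50, 3), np.random.randint(0, 2, 50))

joblib.dump(clf, '/path/to/model_dir/model.pkl')   # save the fitted model
clf = joblib.load('/path/to/model_dir/model.pkl')  # reload it later
</pre></div>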
 
 <p>
@@ -37,25 +28,14 @@
 
 <h2>NOTES</h2>
 
-<em><b>r.randomforest</b></em> uses the "scikit-learn" machine learning python package. This python package needs to be installed within your GRASS GIS Python environment for <em><b>r.randomforest</b></em> to work.
+<em><b>r.randomforest</b></em> uses the "scikit-learn" machine learning Python package. This Python package needs to be installed within your GRASS GIS Python environment for <em><b>r.randomforest</b></em> to work. It also needs the pandas Python package. For Linux users, these packages should be available through the Linux package manager in most distributions (named for example "python-scikit-learn"). For MS-Windows users using a 64 bit GRASS, the easiest way of installing the packages is by using the precompiled binaries from <a href="http://www.lfd.uci.edu/~gohlke/pythonlibs/">Christoph Gohlke</a> and by using the <a href="https://grass.osgeo.org/download/software/ms-windows/">OSGeo4W</a> installation method of GRASS, where the Python setuptools can also be installed. You can then use 'easy_install pip' to install the pip package manager. Then, you can download the NumPy-1.10+MKL and scikit-learn .whl files and install them using 'pip install packagename.whl'. For MS-Windows with a 32 bit GRASS, scikit-learn is available in the OSGeo4W installer.
 
 <p>
-For Linux users, this package should be available through the linux package manager in most distributions (named for example "python-scikit-learn").
+<em><b>r.randomforest</b></em> is designed to keep system memory requirements relatively low. For this purpose, the rasters are read from the disk row-by-row, using the RasterRow method in PyGRASS. This however does not represent an efficient volume of data to pass to the classifiers, which are mostly multithreaded. Therefore, groups of rows specified by the <i>lines</i> parameter are passed to the classifier, and the reclassified image is reconstructed and written row-by-row back to the disk. <i>Lines=25</i> should be reasonable for most systems with 4-8 GB of RAM. The row-by-row access however results in slow performance when sampling the imagery group to build the training data set. Instead, the default behaviour is to read each predictor into memory one at a time. If this still exceeds the system memory then the <i>l</i> flag can be set to write each predictor to a numpy memmap file, and classification/regression can then be performed on rasters of any size irrespective of the available memory.
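<p>
A minimal sketch of the numpy memmap approach, mirroring the pattern used in ml_utils.py (file name and dimensions are placeholders); the array is backed by a temporary file on disk rather than held in RAM:
<div class="code"><pre>
import numpy as np

# raster dimensions of the current region (placeholders)
rows, cols = 5000, 5000

# the array lives in a temporary file on disk instead of system memory
arr = np.memmap('/tmp/raster_swap.dat', dtype='float32', mode='w+',
                shape=(rows, cols))
arr[:] = -99999.0   # values would be filled from the GRASS raster
</pre></div>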
 
 <p>
-For MS-Windows users using a 64 bit GRASS, the easiest way of installing the packages is by using the precompiled binaries from <a href="http://www.lfd.uci.edu/~gohlke/pythonlibs/">Christoph Gohlke</a> and by using the <a href="https://grass.osgeo.org/download/software/ms-windows/">OSGeo4W</a> installation method of GRASS, where the python setuptools can also be installed. You can then use 'easy_install pip' to install the pip package manager. Then, you can download the NumPy-1.10+MKL and scikit-learn .whl files and install them using 'pip install packagename.whl'. For MS-Windows with a 32 bit GRASS, scikit-learn is available in the OSGeo4W installer.
-
-<p>
-<em><b>r.randomforest</b></em> is designed to keep system memory requirements relatively low. For this purpose, the rasters are read from the disk row-by-row, using the RasterRow method in PyGRASS. This however does not represent an efficient volume of data to pass to the classifiers, which are mostly multithreaded. Therefore, groups of rows specified by the <i>lines</i> parameter are passed to the classifier, and the reclassified image is reconstructed and written row-by-row back to the disk. <i>Lines=25</i> should be reasonable for most systems with 4-8 GB of ram. The row-by-row access however results in slow performance when sampling the imagery group to build the training data set. Instead, the default behaviour is to read each predictor into memory at a time. If this still exceeds the system memory then the <i>l</i> flag can be set to perform row-by-row sampling, and classification/regression can then be performed on rasters of any size irrespective of the available memory (although it will be slow).
-
-<p>
 Many of the classifiers involve a random process which can cause a small amount of variation in the classification results, out-of-bag error, and feature importances. To enable reproducible results, a seed is supplied to the classifier. This can be changed using the <i>randst</i> parameter.
 
-<h2>TODO</h2>
-
-Provide option to perform cross-validation on a polygon or region basis.
-Provide option to perform spatial and non-spatial cross-validation.
-
 <h2>EXAMPLE</h2>
 
 Here we are going to use the GRASS GIS sample North Carolina data set as a basis to perform a Landsat classification. We are going to classify a Landsat 7 scene from 2000, using training information from an older (1996) land cover dataset.
@@ -79,7 +59,7 @@
 Then we can use these training pixels to perform a classification on the more recently obtained Landsat 7 image:
 <div class="code"><pre>
 r.randomforest igroup=lsat7_2000 roi=landclass96_roi output=rf_classification \
-  model=RandomForestClassifier ntrees_rf=500 m_features_rf=-1 minsplit_rf=2 randst=1 lines=25
+  model=RandomForestClassifier n_estimators=500 max_features=-1 min_samples_split=2 randst=1 lines=25
 
 # copy category labels from landclass training map to result
 r.category rf_classification raster=landclass96_roi

Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-12-02 00:43:06 UTC (rev 69969)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-12-02 04:43:01 UTC (rev 69970)
@@ -78,7 +78,7 @@
 #%option
 #% key: min_samples_split
 #% type: integer
-#% description: The minimum number of samples required to split an internal node for tree-based classifiers
+#% description: The minimum number of samples required for node splitting in tree-based classifiers
 #% answer: 2
 #% guisection: Classifier Parameters
 #%end
@@ -86,7 +86,7 @@
 #%option
 #% key: min_samples_leaf
 #% type: integer
-#% description: The minimum number of samples required to be at a leaf node for tree-based classifiers
+#% description: The minimum number of samples required to form a leaf node for tree-based classifiers
 #% answer: 1
 #% guisection: Classifier Parameters
 #%end
@@ -118,15 +118,14 @@
 # General options
 
 #%flag
-#% key: g
-#% label: Print as a shell style script
+#% key: l
+#% label: Use memory swap
 #% guisection: Optional
 #%end
 
-
 #%flag
-#% key: n
-#% label: Normalization
+#% key: s
+#% label: Standardization preprocessing
 #% guisection: Optional
 #%end
 
@@ -301,7 +300,7 @@
     roi = options['roi']
     output = options['output']
     model = options['model']
-    norm_data = flags['n']
+    norm_data = flags['s']
     cv = int(options['cv'])
     cvtype = options['cvtype']
     modelonly = flags['m']
@@ -314,7 +313,7 @@
     save_training = options['save_training']
     importances = flags['f']
     tuning = flags['h']
-    shell = flags['g']
+    lowmem = flags['l']
     ratio = float(options['ratio'])
     errors_file = options['errors_file']
     fimp_file = options['fimp_file']
@@ -366,8 +365,9 @@
     # load or sample training data
     X, y, Id, clf = sample_training_data(roi, maplist, cv, cvtype, model_load,
                                          model_save, load_training,
-                                         save_training, random_state)
-
+                                         save_training, lowmem, random_state)
+                            
+                                         
     # determine the number of class labels using np.unique
     labels = np.unique(y)
 
@@ -402,13 +402,12 @@
             X, X_devel, y, y_devel, Id, Id_devel, clf = \
                 tune_split(X, y, Id, clf, param_grid, ratio, random_state)
 
-            if shell is False:
-                grass.message('\n')
-                grass.message('Searched parameters:')
-                grass.message(str(clf.param_grid))
-                grass.message('\n')
-                grass.message('Best parameters:')
-                grass.message(str(clf.best_params_))
+            grass.message('\n')
+            grass.message('Searched parameters:')
+            grass.message(str(clf.param_grid))
+            grass.message('\n')
+            grass.message('Best parameters:')
+            grass.message(str(clf.best_params_))
 
             clf = clf.best_estimator_
 


