[GRASS-SVN] r69984 - grass-addons/grass7/raster/r.randomforest
svn_grass at osgeo.org
svn_grass at osgeo.org
Sat Dec 3 09:41:29 PST 2016
Author: spawley
Date: 2016-12-03 09:41:29 -0800 (Sat, 03 Dec 2016)
New Revision: 69984
Modified:
grass-addons/grass7/raster/r.randomforest/ml_utils.py
grass-addons/grass7/raster/r.randomforest/r.randomforest.html
grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
bug fix to r.randomforest
Modified: grass-addons/grass7/raster/r.randomforest/ml_utils.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/ml_utils.py 2016-12-03 04:41:46 UTC (rev 69983)
+++ grass-addons/grass7/raster/r.randomforest/ml_utils.py 2016-12-03 17:41:29 UTC (rev 69984)
@@ -1,6 +1,7 @@
import numpy as np
import grass.script as grass
import tempfile
+import copy
from grass.pygrass.raster import RasterRow
from grass.pygrass.gis.region import Region
from grass.pygrass.raster.buffer import Buffer
@@ -9,7 +10,6 @@
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
-from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.utils import shuffle
@@ -387,8 +387,11 @@
y_pred_agg = np.append(y_pred_agg, y_pred)
# calculate metrics
- scores['accuracy'] = np.append(
- scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
+ try:
+ scores['accuracy'] = np.append(
+ scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
+ except:
+ pass
scores['r2'] = np.append(
scores['r2'], metrics.r2_score(y_test, y_pred))
@@ -406,7 +409,7 @@
return(scores, y_test_agg, y_pred_agg)
-def tune_split(X, y, Id, estimator, params, test_size, random_state):
+def tune_split(X, y, Id, estimator, metric, params, test_size, random_state):
if Id is None:
X, X_devel, y, y_devel = train_test_split(X, y, test_size=test_size,
@@ -417,7 +420,7 @@
random_state=random_state, stratify=Id)
clf = GridSearchCV(estimator=estimator, cv=3, param_grid=params,
- scoring="accuracy", n_jobs=-1)
+ scoring=metric, n_jobs=-1)
clf.fit(X_devel, y_devel)
@@ -426,25 +429,17 @@
def feature_importances(clf, X, y):
- min_max_scaler = preprocessing.MinMaxScaler()
-
try:
- clfimp = min_max_scaler.fit_transform(
- clf.feature_importances_.reshape(-1, 1))
+ clfimp = clf.feature_importances_
except:
- try:
- clfimp = min_max_scaler.fit_transform(
- abs(clf.coef_.T).reshape(-1, 1))
- except:
- sk = SelectKBest(f_classif, k='all')
- sk_fit = sk.fit(X, y)
- clfimp = min_max_scaler.fit_transform(
- sk_fit.scores_.reshape(-1, 1))
+ sk = SelectKBest(f_classif, k='all')
+ sk_fit = sk.fit(X, y)
+ clfimp = sk_fit.scores_
return (clfimp)
-def sample_training_data(roi, maplist, cv, cvtype, model_load, model_save,
+def sample_training_data(roi, maplist, cv, cvtype, model_load,
load_training, save_training, lowmem, random_state):
# load the model or training data
@@ -460,17 +455,18 @@
# create clumped roi for spatial cross validation
if cv > 1 and cvtype == 'clumped':
r.clump(input=roi, output='tmp_roi_clumped', overwrite=True, quiet=True)
- maplist2 = maplist
+ maplist2 = copy.deepcopy(maplist)
maplist2.append('tmp_roi_clumped')
X, y, sample_coords = sample_predictors(response=roi,
predictors=maplist2,
shuffle_data=False,
- lowmem=lowmem)
+ lowmem=lowmem,
+ random_state=random_state)
# take Id from last column
- Id = X[:, X.shape[1]-1]
+ Id = X[:, -1]
# remove Id column from predictors
- X = X[:, 0:X.shape[1]]
+ X = X[:, 0:X.shape[1]-1]
else:
# query predictor rasters with training features
Id = None
@@ -487,8 +483,6 @@
if save_training != '':
save_training_data(X, y, Id, save_training)
-
- if model_save != '':
- save_training_data(X, y, Id, model_save + ".csv")
+
return (X, y, Id, clf)
Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.html
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.html 2016-12-03 04:41:46 UTC (rev 69983)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.html 2016-12-03 17:41:29 UTC (rev 69984)
@@ -9,7 +9,7 @@
The Classifier parameters tab provides access to the most pertinent parameters that affect the previously described algorithms. <i>C</i> is the inverse of the regularization strength, which is when a penalty is applied to avoid overfitting. <i>C</i> applies to the LogisticRegression and SVC models. Most of the other parameters apply to the tree and ensemble-tree based classifiers. <i>n_estimators</i> represents the number of trees in Random Forest model, and the number of trees used in each model step during Gradient Boosting. <i>max_features</i> controls the number of variables that are allowed to be chosen from at each node split in the tree-based models, and can be considered to control the degree of correlation between the trees in ensemble tree methods. <i>min_samples_split</i> and <i>min_samples_leaf</i> control the number of samples required to split a node, or form a leaf node, respectively. The <i>learning_rate</i> and <i>subsample</i> parameters apply only to Gradient Boosting. <i>learning_rate</i> shrinks the contribution of each tree, and <i>subsample</i> is the fraction of randomly selected samples for each tree, and values of < 1 reduce the model variance resulting in Stochastic Gradient Boosting.
<p>
-In addition to model fitting and prediction, <em><b>r.randomforest</b></em> can be used for feature selection using the <i>f</i> flag. The linear model classifiers (LogisticRegression, SVC) provide fit coefficients which can be used to evaluate the importance of each of the predictors. Furthermore, the tree-based classifiers include an intrisic measure of variable importance based on the relative rank (depth) of a feature used as a decision node in a tree. If the classifier provides neither of these methods as is the case with LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis and GaussianNB, then univariate feature selection is used to provide feature importance scores.
+In addition to model fitting and prediction, <em><b>r.randomforest</b></em> can be used for feature selection using the <i>f</i> flag. The tree-based classifiers include an intrisic measure of variable importance based on the relative rank (depth) of a feature used as a decision node in a tree. For other classifiers, univariate feature selection is used to provide feature importance scores.
<p>
Cross validation can be performed by setting the <i>cv</i> parameters to > 1. Cross-validation is performed using stratified kfolds, and multiple global and per-class accuracy measures are produced. Also note that this cross-validation is performed on a pixel basis. If there is a strong autocorrelation between pixels (i.e. the pixels represent polygons) then the training/test splits will not represent independent samples and will overestimate the accuracy. In this case, the <i>cvtype</i> parameter can be changed from 'non-spatial' to either 'clumped' or 'kmeans' to perform spatial cross-validation. Clumped spatial cross-validation is used if the training pixels represent polygons, and then cross-validation will be effectively performed on a polygon basis. Kmeans spatial cross-validation will partition the training pixels into groups by kmeans clustering of the pixel coordinates. These partitions will then be used for cross-validation, which should provide more realistic performance measures if the data are spatially correlated.
Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-12-03 04:41:46 UTC (rev 69983)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-12-03 17:41:29 UTC (rev 69984)
@@ -338,6 +338,12 @@
max_features = str('auto')
if max_depth == -1:
max_depth = None
+
+ if (model == 'LinearDiscriminantAnalysis' or
+ model == 'QuadraticDiscriminantAnalysis' or
+ model == 'GaussianNB'):
+ grass.warning('No parameters to tune for selected model...ignoring')
+ tuning = False
"""
Obtain information about GRASS rasters to be classified
@@ -365,7 +371,7 @@
# load or sample training data
X, y, Id, clf = sample_training_data(roi, maplist, cv, cvtype, model_load,
- model_save, load_training,
+ load_training,
save_training, lowmem, random_state)
# determine the number of class labels using np.unique
@@ -386,21 +392,32 @@
--------------------
"""
+ grass.message("Model=" + model)
+ clf, param_grid, mode =\
+ model_classifiers(model, random_state,
+ class_weight, C, max_depth,
+ max_features, min_samples_split,
+ min_samples_leaf, n_estimators,
+ subsample, learning_rate)
+
+ # check for classification or regression mode
+ if mode == 'regression' and probability is True:
+ grass.warning('Class probabilities only possible for classifications...ignoring')
+ probability = False
+
# define classifier unless model is to be loaded from file
if model_load == '':
- grass.message("Model=" + model)
- clf, param_grid, mode =\
- model_classifiers(model, random_state,
- class_weight, C, max_depth,
- max_features, min_samples_split,
- min_samples_leaf, n_estimators,
- subsample, learning_rate)
+ # data splitting for automatic parameter tuning
+ if tuning is True:
- # data splitting for automatic parameter tuning
- if tuning is True:
+ if mode == 'classification':
+ metric = 'accuracy'
+ else:
+ metric = 'r2'
+
X, X_devel, y, y_devel, Id, Id_devel, clf = \
- tune_split(X, y, Id, clf, param_grid, ratio, random_state)
+ tune_split(X, y, Id, clf, metric, param_grid, ratio, random_state)
grass.message('\n')
grass.message('Searched parameters:')
@@ -478,9 +495,8 @@
# output to GRASS message
grass.message("\r\n")
- grass.message("Normalized feature importances")
+ grass.message("Feature importances")
grass.message("id" + "\t" + "Raster" + "\t" + "Importance")
-
for i in range(len(clfimp)):
grass.message(
str(i) + "\t" + maplist[i] +
@@ -488,7 +504,7 @@
if fimp_file != '':
fimp_output = pd.DataFrame(
- {'grass raster': maplist, 'importance': clfimp[:, 0]})
+ {'grass raster': maplist, 'importance': clfimp})
fimp_output.to_csv(
path_or_buf=fimp_file,
header=['grass raster', 'importance'])
@@ -499,8 +515,12 @@
"""
if model_save != '':
- joblib.dump(clf, model_save + ".pkl")
+ joblib.dump(clf, model_save)
+ save_training_data(
+ X, y, Id, model_save.replace(".pkl", ".csv"))
+
+
if modelonly is True:
grass.fatal("Model built and now exiting")
@@ -508,7 +528,6 @@
Prediction on the rest of the GRASS rasters in the imagery group
----------------------------------------------------------------
"""
-
prediction(clf, labels, maplist, scaler, probability,
rowincr, output, mode)
More information about the grass-commit
mailing list