[GRASS-SVN] r70465 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
Tue Jan 31 21:16:32 PST 2017
Author: spawley
Date: 2017-01-31 21:16:32 -0800 (Tue, 31 Jan 2017)
New Revision: 70465
Modified:
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
Log:
'fixed indentation bug in oversampling'
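The indentation bug named in the log is the `return` inside `random_oversampling` that sat within the per-class loop, so only the first class processed was rebalanced before the function returned; the hunk below dedents it. A minimal standalone sketch of the same idea, with illustrative names rather than the module's exact code:

# Sketch of the random oversampling idea fixed in this revision.
# Hypothetical standalone function; names are illustrative, not the module's API.
import numpy as np

def random_oversample(X, y, random_state=None):
    """Duplicate minority-class rows at random until every class matches the majority count."""
    rng = np.random.RandomState(random_state)
    classes, counts = np.unique(y, return_counts=True)
    n_major = counts.max()
    X_res, y_res = X, y
    for cls, count in zip(classes, counts):
        extra = n_major - count                  # samples needed for this class
        idx = np.flatnonzero(y == cls)           # rows belonging to this class
        pick = rng.choice(idx, size=extra)       # sample with replacement
        X_res = np.concatenate((X_res, X[pick]))
        y_res = np.concatenate((y_res, y[pick]))
    # the return must sit outside the loop; indented one level deeper it
    # returns after the first class and the remaining classes stay unbalanced,
    # which is the bug corrected by this revision
    return X_res, y_res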
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-01-31 14:40:49 UTC (rev 70464)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-02-01 05:16:32 UTC (rev 70465)
@@ -289,7 +289,6 @@
#%rules
#% exclusive: trainingmap,load_model
#% exclusive: load_training,save_training
-
#%end
import atexit
@@ -335,12 +334,12 @@
self.enc = None
self.categorical_var = categorical_var
self.category_values = None
-
+
if self.categorical_var:
self.onehotencode()
-
+
# for standardization
- if standardize == True:
+ if standardize is True:
self.standardization()
else:
self.scaler = None
@@ -350,49 +349,47 @@
self.scores_cm = None
self.fimp = None
-
def random_oversampling(self, X, y, random_state=None):
"""
Balances X, y observations using simple oversampling
-
+
Args
----
X: numpy array of training data
y: 1D numpy array of response data
random_state: Seed to pass onto random number generator
-
+
Returns
-------
X_resampled: Numpy array of resampled training data
y_resampled: Numpy array of resampled response data
"""
-
+
np.random.seed(seed=random_state)
-
+
# count the number of observations per class
y_classes = np.unique(y)
class_counts = np.histogram(y, bins=len(y_classes))[0]
maj_counts = class_counts.max()
-
+
y_resampled = y
X_resampled = X
-
+
for cla, counts in zip(y_classes, class_counts):
# get the number of samples needed to balance minority class
num_samples = maj_counts - counts
-
+
# get the indices of the ith class
- indx = np.nonzero(y==cla)
-
- # create some new indices
+ indx = np.nonzero(y == cla)
+
+ # create some new indices
oversamp_indx = np.random.choice(indx[0], size=num_samples)
-
+
# concatenate to the original X and y
y_resampled = np.concatenate((y[oversamp_indx], y_resampled))
X_resampled = np.concatenate((X[oversamp_indx], X_resampled))
-
- return (X_resampled, y_resampled)
+ return (X_resampled, y_resampled)
def onehotencode(self):
"""
@@ -406,14 +403,13 @@
self.category_values = [0] * len(self.categorical_var)
for i, cat in enumerate(self.categorical_var):
self.category_values[i] = np.unique(self.X[:, cat])
-
+
# fit and transform categorical grids to a suite of binary features
self.enc = OneHotEncoder(categorical_features=self.categorical_var,
sparse=False)
self.enc.fit(self.X)
- self.X = self.enc.transform(self.X)
+ self.X = self.enc.transform(self.X)
-
def fit(self, param_distributions=None, param_grid=None, n_iter=3, cv=3,
random_state=None):
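For reference, the onehotencode() hunk above relies on the OneHotEncoder(categorical_features=..., sparse=False) interface of scikit-learn from that era; the categorical_features argument was removed in later scikit-learn releases. A minimal sketch under that assumption, with illustrative data:

# Sketch of one-hot encoding selected (categorical) columns, assuming the
# legacy scikit-learn OneHotEncoder(categorical_features=...) API used above.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[0.5, 2, 7.1],
              [1.3, 0, 6.4],
              [0.9, 2, 5.8]])
categorical_var = [1]                       # column 1 holds integer category codes

enc = OneHotEncoder(categorical_features=categorical_var, sparse=False)
enc.fit(X)
X_encoded = enc.transform(X)                # the category column expands to binary columns
print(X_encoded.shape)                      # more columns than the original X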
@@ -423,7 +419,7 @@
Args
----
- param_distributions: continuous parameter distribution to be used in a
+ param_distributions: continuous parameter distribution to be used in a
randomizedCVsearch
param_grid: Dist of non-continuous parameters to grid search
n_iter: Number of randomized search iterations
@@ -433,11 +429,12 @@
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import GroupKFold
-
+
# Balance classes
- if self.balance == True:
- X, y = self.random_oversampling(self.X, self.y, random_state=random_state)
-
+ if self.balance is True:
+ X, y = self.random_oversampling(
+ self.X, self.y, random_state=random_state)
+
if self.groups is not None:
groups, _ = self.random_oversampling(
self.groups, self.y, random_state=random_state)
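The reformatted balancing block above oversamples X, y and the group labels together, then hands groups to a grouped hyperparameter search. A small sketch of that pattern, with illustrative data and parameter ranges:

# Sketch of a grouped randomized hyperparameter search, the pattern fit()
# follows when groups are present; data and parameter ranges are illustrative.
import numpy as np
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GroupKFold

rng = np.random.RandomState(0)
X = rng.rand(60, 4)
y = rng.randint(0, 2, 60)
groups = np.repeat(np.arange(6), 10)        # e.g. spatial clumps of training pixels

search = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    param_distributions={'n_estimators': randint(10, 100)},
    n_iter=3,
    cv=GroupKFold(n_splits=3))              # folds never split a group
search.fit(X, y, groups=groups)             # groups must be passed to fit()
print(search.best_params_)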
@@ -450,13 +447,13 @@
# Randomized or grid search
if param_distributions is not None or param_grid is not None:
-
+
# use groupkfold for hyperparameter search if groups are present
if self.groups is not None:
cv_search = GroupKFold(n_splits=cv)
else:
cv_search = cv
-
+
# Randomized search
if param_distributions is not None:
self.estimator = RandomizedSearchCV(
@@ -464,31 +461,30 @@
param_distributions=param_distributions,
n_iter=n_iter,
cv=cv_search)
-
+
# Grid Search
if param_grid is not None:
self.estimator = GridSearchCV(self.estimator,
param_grid,
n_jobs=-1, cv=cv_search)
-
+
# if groups then fit RandomizedSearchCV.fit requires groups param
if self.groups is None:
self.estimator.fit(X, y)
else:
self.estimator.fit(X, y, groups=groups)
-
+
# Fitting without parameter search
else:
self.estimator.fit(X, y)
-
def standardization(self):
"""
Transforms the non-categorical X
"""
from sklearn.preprocessing import StandardScaler
-
+
# create mask so that indices that represent categorical
# predictors are not selected
if self.categorical_var is not None:
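The standardization() changes above scale only the continuous predictors by masking out the categorical column indices. A minimal sketch of that masking step, with an illustrative array:

# Sketch of standardizing only the continuous predictors, as in
# standardization() above; the categorical column index is illustrative.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[10.0, 1, 0.2],
              [12.0, 0, 0.4],
              [ 9.0, 1, 0.1]])
categorical_var = [1]                            # column 1 is categorical

mask = np.ones(X.shape[1], dtype=bool)
mask[categorical_var] = False                    # keep categorical columns out

scaler = StandardScaler()
scaler.fit(X[:, mask])
X[:, mask] = scaler.transform(X[:, mask])        # categorical column is unchanged
print(X)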
@@ -498,12 +494,11 @@
else:
mask = np.arange(self.X.shape[1])
- X_continuous = self.X[:, mask]
+ X_continuous = self.X[:, mask]
self.scaler = StandardScaler()
self.scaler.fit(X_continuous)
- self.X[:, mask] = self.scaler.transform(X_continuous)
+ self.X[:, mask] = self.scaler.transform(X_continuous)
-
def pred_func(self, estimator, X_test, y_true, scorers):
"""
Calculates a single performance metric depending on if scorer type
@@ -535,7 +530,6 @@
return (score)
-
def varImp_permutation(self, estimator, X_test, y_true,
n_permutations, scorers,
random_state):
@@ -584,14 +578,14 @@
# fit the model on the training data and predict the test data
scores[rep, i] = best_score-self.pred_func(
estimator, Xscram, y_true, scorers)
- if scores[rep, i] < 0: scores[rep, i] = 0
+ if scores[rep, i] < 0:
+ scores[rep, i] = 0
# average the repetitions
scores = scores.mean(axis=0)
return(scores)
-
def specificity_score(self, y_true, y_pred):
from sklearn.metrics import confusion_matrix
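The varImp_permutation hunk above scores importance by scrambling one predictor at a time and clipping negative score drops to zero, which is what the reindented lines implement. A small sketch of the idea, assuming an already fitted estimator and plain accuracy as the scorer:

# Sketch of permutation-based variable importance: shuffle one column at a
# time and record the drop in score. Estimator and data are illustrative.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(42)
X = rng.rand(200, 3)
y = (X[:, 0] > 0.5).astype(int)                 # only column 0 is informative
est = LogisticRegression().fit(X, y)

best_score = accuracy_score(y, est.predict(X))
importance = np.zeros(X.shape[1])
for i in range(X.shape[1]):
    X_scrambled = X.copy()
    rng.shuffle(X_scrambled[:, i])              # destroy the i-th predictor
    drop = best_score - accuracy_score(y, est.predict(X_scrambled))
    importance[i] = max(drop, 0)                # negative drops clipped to zero
print(importance)                               # column 0 dominates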
@@ -599,15 +593,14 @@
cm = confusion_matrix(y_true, y_pred)
tn = float(cm[0][0])
- #fn = float(cm[1][0])
- #tp = float(cm[1][1])
+ # fn = float(cm[1][0])
+ # tp = float(cm[1][1])
fp = float(cm[0][1])
specificity = tn/(tn+fp)
return (specificity)
-
def cross_val(self, scorers='binary', cv=3, feature_importances=False,
n_permutations=25, random_state=None):
@@ -673,34 +666,36 @@
# get indices for train and test partitions
X_train, X_test = self.X[train_indices], self.X[test_indices]
- y_train, y_test = self.y[train_indices], self.y[test_indices]
-
+ y_train, y_test = self.y[train_indices], self.y[test_indices]
+
# balance the fold
- if self.balance == True:
- X_train, y_train = self.random_oversampling(X_train, y_train, random_state=random_state)
+ if self.balance is True:
+ X_train, y_train = self.random_oversampling(
+ X_train, y_train, random_state=random_state)
if self.groups is not None:
groups_train = self.groups[train_indices]
groups_train, _ = self.random_oversampling(
- groups_train, self.y[train_indices], random_state=random_state)
+ groups_train, self.y[train_indices],
+ random_state=random_state)
else:
# also get indices of groups for the training partition
if self.groups is not None:
groups_train = self.groups[train_indices]
-
+
# fit the model on the training data and predict the test data
- # need the groups parameter because the estimator can be a
+ # need the groups parameter because the estimator can be a
# RandomizedSearchCV estimator where cv=GroupKFold
- if isinstance(self.estimator, RandomizedSearchCV) == True \
- or isinstance(self.estimator, GridSearchCV):
+ if isinstance(self.estimator, RandomizedSearchCV) is True \
+ or isinstance(self.estimator, GridSearchCV):
param_search = True
else:
param_search = False
-
- if self.groups is not None and param_search == True:
+
+ if self.groups is not None and param_search is True:
fit = self.estimator.fit(X_train, y_train, groups=groups_train)
else:
- fit = self.estimator.fit(X_train, y_train)
+ fit = self.estimator.fit(X_train, y_train)
y_pred = fit.predict(X_test)
@@ -755,7 +750,7 @@
self.scores['r2'], metrics.r2_score(y_test, y_pred))
# feature importances using permutation
- if feature_importances == True:
+ if feature_importances is True:
if (self.fimp==0).all() == True:
self.fimp = self.varImp_permutation(
fit, X_test, y_test, n_permutations, scorers,
@@ -770,33 +765,35 @@
# convert onehot-encoded feature importances back to original vars
if self.fimp is not None and self.enc is not None:
-
+
from copy import deepcopy
# get start,end positions of each suite of onehot-encoded vars
feature_ranges = deepcopy(self.enc.feature_indices_)
for i in range(0, len(self.enc.feature_indices_)-1):
- feature_ranges[i+1] = feature_ranges[i] + len(self.category_values[i])
-
+ feature_ranges[i+1] =\
+ feature_ranges[i] + len(self.category_values[i])
+
# take sum of each onehot-encoded feature
ohe_feature = [0] * len(self.categorical_var)
ohe_sum = [0] * len(self.categorical_var)
-
+
for i in range(len(self.categorical_var)):
- ohe_feature[i] = self.fimp[:, feature_ranges[i]:feature_ranges[i+1]]
+ ohe_feature[i] = \
+ self.fimp[:, feature_ranges[i]:feature_ranges[i+1]]
ohe_sum[i] = ohe_feature[i].sum(axis=1)
-
+
# remove onehot-encoded features from the importances array
features_for_removal = np.array(range(feature_ranges[-1]))
- self.fimp = np.delete(self.fimp, features_for_removal, axis=1)
-
+ self.fimp = np.delete(self.fimp, features_for_removal, axis=1)
+
# insert summed importances into original positions
for index in self.categorical_var:
- self.fimp = np.insert(self.fimp, np.array(index), ohe_sum[0], axis=1)
+ self.fimp = np.insert(
+ self.fimp, np.array(index), ohe_sum[0], axis=1)
-
def predict(self, predictors, output, class_probabilities=False,
- rowincr=25):
+ rowincr=25):
"""
Prediction on list of GRASS rasters using a fitted scikit learn model
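The importance hunk above collapses each suite of one-hot columns back into a single value at the original predictor's position. A minimal numpy sketch of that aggregation, with illustrative column ranges:

# Sketch of collapsing one-hot-encoded importance columns back onto the
# original categorical variable; ranges and positions are illustrative.
import numpy as np

fimp = np.array([[0.10, 0.05, 0.02, 0.03, 0.40]])   # importances per encoded column
onehot_range = slice(1, 4)                           # columns 1-3 came from one category map
original_index = 1                                   # that map's position among raw predictors

summed = fimp[:, onehot_range].sum(axis=1)           # one value per repetition
fimp = np.delete(fimp, np.arange(1, 4), axis=1)      # drop the expanded columns
fimp = np.insert(fimp, original_index, summed, axis=1)
print(fimp)                                          # [[0.10, 0.10, 0.40]]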
@@ -833,11 +830,10 @@
grass.fatal("GRASS raster " + predictors[i] +
" does not exist.... exiting")
- # use grass.pygrass.gis.region to get information about the current region
current = Region()
# create a imagery mask
- # the input rasters might have different dimensions and non-value pixels.
+ # the input rasters might have different dimensions and null pixels.
# r.series used to automatically create a mask by propagating the nulls
grass.run_command("r.series", output='tmp_clfmask',
input=predictors, method='count', flags='n',
@@ -889,7 +885,7 @@
mask_np_row[mask_np_row == -2147483648] = np.nan
nanmask = np.isnan(mask_np_row) # True in mask means invalid data
-
+
# reshape each row-band matrix into a n*m array
nsamples = rowincr * current.cols
flat_pixels = img_np_row.reshape((nsamples, n_features))
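The block above converts the GRASS integer null (-2147483648) to NaN for the row mask and flattens each row chunk to an (nsamples, n_features) matrix before prediction. A small sketch of that reshape-and-mask step, with illustrative sizes and a stand-in estimator:

# Sketch of the reshape-and-mask step: flatten a (rows, cols, bands) chunk
# to (nsamples, n_features), predict, and restore nulls where the mask is
# invalid. Sizes, axis order and the estimator are illustrative.
import numpy as np
from sklearn.dummy import DummyClassifier

rowincr, cols, n_features = 2, 4, 3
img_np_row = np.random.rand(rowincr, cols, n_features)
mask_np_row = np.random.choice([0.0, -2147483648.0], size=(rowincr, cols))

mask_np_row[mask_np_row == -2147483648] = np.nan
nanmask = np.isnan(mask_np_row)                      # True marks invalid pixels

nsamples = rowincr * cols
flat_pixels = img_np_row.reshape((nsamples, n_features))

est = DummyClassifier(strategy='most_frequent').fit(flat_pixels, np.zeros(nsamples))
result = est.predict(flat_pixels).reshape((rowincr, cols))
result = np.where(nanmask, np.nan, result)           # write nulls back into the output
print(result)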
@@ -907,8 +903,11 @@
# on the training samples, but the prediction data contains
# new values, i.e. the training data has not sampled all of
# categories
- grass.fatal('There are values in the categorical rasters that are not present in the training data set, i.e. the training data has not sampled all of the categories')
-
+ grass.fatal('There are values in the categorical rasters ',
+ 'that are not present in the training data ',
+ 'set, i.e. the training data has not sampled ',
+ 'all of the categories')
+
# rescale
if self.scaler is not None:
# create mask so that indices that represent categorical
@@ -919,8 +918,9 @@
mask[self.categorical_var] = False
else:
mask = np.arange(self.X.shape[1])
- flat_pixels_continuous = flat_pixels[:, mask]
- flat_pixels[:, mask] = self.scaler.transform(flat_pixels_continuous)
+ flat_pixels_continuous = flat_pixels[:, mask]
+ flat_pixels[:, mask] = self.scaler.transform(
+ flat_pixels_continuous)
# perform prediction
result = self.estimator.predict(flat_pixels)
@@ -981,7 +981,7 @@
grass.run_command("g.remove", name='tmp_clfmask',
flags="f", type="raster", quiet=True)
grass.run_command("g.remove", name='tmp_roi_clumped',
- flags="f", type="raster", quiet=True)
+ flags="f", type="raster", quiet=True)
def model_classifiers(estimator='LogisticRegression', random_state=None,
@@ -1029,9 +1029,9 @@
from sklearn.pipeline import Pipeline
from pyearth import Earth
- # Combine Earth with LogisticRegression in a pipeline to do classification
earth_classifier = Pipeline([('Earth',
- Earth(max_degree=max_degree)), ('Logistic', LogisticRegression())])
+ Earth(max_degree=max_degree)),
+ ('Logistic', LogisticRegression())])
classifiers = {'EarthClassifier': earth_classifier,
'EarthRegressor': Earth(max_degree=max_degree)}
@@ -1045,10 +1045,10 @@
LogisticRegression(C=C, random_state=random_state, n_jobs=-1),
'DecisionTreeClassifier':
DecisionTreeClassifier(max_depth=max_depth,
- max_features=max_features,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- random_state=random_state),
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state),
'DecisionTreeRegressor':
DecisionTreeRegressor(max_features=max_features,
min_samples_split=min_samples_split,
@@ -1234,7 +1234,7 @@
# Loop through each raster and sample pixel values at training indexes
if lowmem is True:
feature_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
+ dtype='float32', mode='w+',
shape=(current.rows, current.cols))
for f in range(n_features):
@@ -1401,7 +1401,7 @@
cv = int(options['cv'])
cvtype = options['cvtype']
group_raster = options['group_raster']
- categorymaps = options['categorymaps']
+ categorymaps = options['categorymaps']
n_partitions = int(options['n_partitions'])
modelonly = flags['m']
probability = flags['p']
@@ -1418,22 +1418,22 @@
errors_file = options['errors_file']
fimp_file = options['fimp_file']
balance = flags['b']
-
+
if ',' in categorymaps:
categorymaps = [int(i) for i in categorymaps.split(',')]
else:
categorymaps = None
-
+
param_grid = {'C': None,
- 'min_samples_split': None,
- 'min_samples_leaf': None,
- 'n_estimators': None,
- 'learning_rate': None,
- 'subsample': None,
- 'max_depth': None,
- 'max_features': None,
- 'max_degree': None}
-
+ 'min_samples_split': None,
+ 'min_samples_leaf': None,
+ 'n_estimators': None,
+ 'learning_rate': None,
+ 'subsample': None,
+ 'max_depth': None,
+ 'max_features': None,
+ 'max_degree': None}
+
# classifier options
C = options['c']
if ',' in C:
@@ -1441,17 +1441,19 @@
C = None
else:
C = float(C)
-
+
min_samples_split = options['min_samples_split']
if ',' in min_samples_split:
- param_grid['min_samples_split'] = [float(i) for i in min_samples_split.split(',')]
- min_samples_split = None
+ param_grid['min_samples_split'] = \
+ [float(i) for i in min_samples_split.split(',')]
+ min_samples_split = None
else:
min_samples_split = int(min_samples_split)
-
+
min_samples_leaf = options['min_samples_leaf']
if ',' in min_samples_leaf:
- param_grid['min_samples_leaf'] = [int(i) for i in min_samples_leaf.split(',')]
+ param_grid['min_samples_leaf'] = \
+ [int(i) for i in min_samples_leaf.split(',')]
min_samples_leaf = None
else:
min_samples_leaf = int(min_samples_leaf)
@@ -1465,7 +1467,8 @@
learning_rate = options['learning_rate']
if ',' in learning_rate:
- param_grid['learning_rate'] = [float(i) for i in learning_rate.split(',')]
+ param_grid['learning_rate'] = \
+ [float(i) for i in learning_rate.split(',')]
learning_rate = None
else:
learning_rate = float(learning_rate)
@@ -1486,24 +1489,25 @@
max_depth = None
else:
max_depth = float(max_depth)
-
+
max_features = options['max_features']
if max_features == '':
max_features = 'auto'
else:
if ',' in max_features:
- param_grid['max_features'] = [int(i) for i in max_features.split(',')]
+ param_grid['max_features'] = \
+ [int(i) for i in max_features.split(',')]
max_features = None
else:
max_features = int(max_features)
-
+
max_degree = options['max_degree']
if ',' in max_degree:
param_grid['max_degree'] = [int(i) for i in max_degree.split(',')]
max_degree = None
else:
max_degree = int(max_degree)
-
+
if importances is True and cv == 1:
grass.fatal('Feature importances require cross-validation cv > 1')
@@ -1541,22 +1545,23 @@
C, max_depth, max_features, min_samples_split,
min_samples_leaf, n_estimators,
subsample, learning_rate, max_degree)
-
+
# turn off balancing if mode = regression
- if mode == 'regression' and balance == True:
+ if mode == 'regression' and balance is True:
balance = False
# remove empty items from the param_grid dict
- param_grid = {k: v for k, v in param_grid.iteritems() if v != None}
-
+ param_grid = {k: v for k, v in param_grid.iteritems() if v is not None}
+
# check that dict keys are compatible for the selected classifier
clf_params = clf.get_params()
param_grid = { key: value for key, value in param_grid.iteritems() if key in clf_params}
-
+
# check if dict contains and keys, otherwise set it to None
# so that the train object will not perform GridSearchCV
- if any(param_grid) != True: param_grid = None
-
+ if any(param_grid) is not True:
+ param_grid = None
+
# Decide on scoring metric scheme
if mode == 'classification':
if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
@@ -1565,10 +1570,11 @@
scorers = 'multiclass'
else:
scorers = 'regression'
-
+
if mode == 'regression' and probability is True:
grass.warning(
- 'Class probabilities only valid for classifications...ignoring')
+ 'Class probabilities only valid for classifications...',
+ 'ignoring')
probability = False
# create training object - onehot-encoded on-the-fly
@@ -1581,7 +1587,8 @@
"""
# fit and parameter search
- learn_m.fit(param_grid=param_grid, cv=tune_cv, random_state=random_state)
+ learn_m.fit(param_grid=param_grid, cv=tune_cv,
+ random_state=random_state)
if param_grid is not None:
grass.message('\n')
@@ -1593,9 +1600,10 @@
grass.message('\r\n')
grass.message(
"Cross validation global performance measures......:")
-
+
# cross-validate the training object
- learn_m.cross_val(scorers, cv, importances, n_permutations=n_permutations,
+ learn_m.cross_val(scorers, cv, importances,
+ n_permutations=n_permutations,
random_state=random_state)
if mode == 'classification':
@@ -1656,7 +1664,9 @@
errors = pd.DataFrame(learn_m.scores)
errors.to_csv(errors_file, mode='w')
except:
- grass.warning("Pandas is not installed. Pandas is required to write the cross-validation results to file")
+ grass.warning('Pandas is not installed. Pandas is ',
+ 'required to write the cross-validation ',
+ 'results to file')
# feature importances
if importances is True:
@@ -1688,7 +1698,6 @@
if model_save != '':
joblib.dump(learn_m, model_save)
-
"""
Prediction on the rest of the GRASS rasters in the imagery group
----------------------------------------------------------------
More information about the grass-commit mailing list