[GRASS-SVN] r70551 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
svn_grass at osgeo.org
Mon Feb 13 08:43:14 PST 2017
Author: spawley
Date: 2017-02-13 08:43:14 -0800 (Mon, 13 Feb 2017)
New Revision: 70551
Modified:
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
grass-addons/grass7/raster/r.learn.ml/raster_learning.py
Log:
r.learn.ml refactoring of OOP to be more consistent with scikit-learn philisophy; cross-validation command output also reformatted to tab-delineated for easy cut and paste
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-02-13 15:31:59 UTC (rev 70550)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-02-13 16:43:14 UTC (rev 70551)
@@ -65,7 +65,7 @@
#% type: integer
#% description: Number of features to consider during splitting for tree based classifiers. Default is sqrt(n_features) for classification, and n_features for regression
#% required: no
-#% answer:
+#% answer:0
#% multiple: yes
#% guisection: Classifier Parameters
#%end
@@ -75,7 +75,7 @@
#% type: integer
#% description: Optionally specifiy maximum tree depth. Otherwise full-growing occurs for decision trees and random forests, and max_depth=3 for gradient boosting
#% required: no
-#% answer:
+#% answer:0
#% multiple: yes
#% guisection: Classifier Parameters
#%end
@@ -230,14 +230,6 @@
#%end
#%option
-#% key: tune_cv
-#% type: integer
-#% description: Number of cross-validation folds used for parameter tuning
-#% answer: 3
-#% guisection: Optional
-#%end
-
-#%option
#% key: n_permutations
#% type: integer
#% description: Number of permutations to perform for feature importances
@@ -299,6 +291,7 @@
#%end
import atexit
+import os
import numpy as np
import grass.script as grass
from copy import deepcopy
@@ -308,31 +301,39 @@
set_path('r.learn.ml')
from raster_learning import train, model_classifiers
from raster_learning import save_training_data, load_training_data
-from raster_learning import extract, maps_from_group
+from raster_learning import extract, maps_from_group, random_oversampling
+tmp_rast = []
+
def cleanup():
- grass.run_command("g.remove", name='tmp_clfmask',
- flags="f", type="raster", quiet=True)
- grass.run_command("g.remove", name='tmp_roi_clumped',
- flags="f", type="raster", quiet=True)
+ for rast in tmp_rast:
+ grass.run_command("g.remove", rast=rast, quiet=True)
def main():
+ """
+ Lazy imports for main------------------------------------------------------
+ """
+
try:
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.metrics import make_scorer, cohen_kappa_score
+ from sklearn.model_selection import StratifiedKFold, GroupKFold
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.model_selection import GridSearchCV
+ import warnings
+ warnings.filterwarnings('ignore') # turn off UndefinedMetricWarning
except:
grass.fatal("Scikit learn 0.18 or newer is not installed")
"""
- GRASS options and flags
- -----------------------
+ GRASS options and flags----------------------------------------------------
"""
- # General options and flags
+ # General options and flags -----------------------------------------------
group = options['group']
response = options['trainingmap']
output = options['output']
@@ -352,7 +353,6 @@
load_training = options['load_training']
save_training = options['save_training']
importances = flags['f']
- tune_cv = int(options['tune_cv'])
n_permutations = int(options['n_permutations'])
lowmem = flags['l']
impute = flags['i']
@@ -365,6 +365,7 @@
else:
categorymaps = None
+ # classifier options and parameter grid settings --------------------------
param_grid = {'C': None,
'min_samples_split': None,
'min_samples_leaf': None,
@@ -375,7 +376,6 @@
'max_features': None,
'max_degree': None}
- # classifier options
C = options['c']
if ',' in C:
param_grid['C'] = [float(i) for i in C.split(',')]
@@ -422,7 +422,7 @@
subsample = float(subsample)
max_depth = options['max_depth']
- if max_depth == '':
+ if max_depth == '0':
max_depth = None
else:
if ',' in max_depth:
@@ -432,7 +432,7 @@
max_depth = int(max_depth)
max_features = options['max_features']
- if max_features == '':
+ if max_features == '0':
max_features = 'auto'
else:
if ',' in max_features:
@@ -449,15 +449,17 @@
else:
max_degree = int(max_degree)
+ # remove empty items from the param_grid dict
+ param_grid = {k: v for k, v in param_grid.iteritems() if v is not None}
+
if importances is True and cv == 1:
grass.fatal('Feature importances require cross-validation cv > 1')
- # fetch individual raster names from group
+ # fetch individual raster names from group --------------------------------
maplist, map_names = maps_from_group(group)
"""
- Sample training data and group ids
- --------------------
+ Sample training data and group ids ----------------------------------------
"""
if model_load == '':
@@ -466,13 +468,17 @@
if load_training != '':
X, y, group_id = load_training_data(load_training)
else:
+ grass.message('Extracting training data')
+
# clump the labelled pixel raster if labels represent polygons
# then set the group_raster to the clumped raster to extract the
# group_ids used in the GroupKFold cross-validation
if cvtype == 'clumped' and group_raster == '':
- r.clump(input=response, output='tmp_roi_clumped',
+ clumped_trainingmap = 'tmp_clumped_trainingmap'
+ tmp_rast.append(clumped_trainingmap)
+ r.clump(input=response, output=clumped_trainingmap,
overwrite=True, quiet=True)
- group_raster = 'tmp_roi_clumped'
+ group_raster = clumped_trainingmap
# extract training data from maplist and take group ids from
# group_raster. Shuffle=False so that group ids and labels align
@@ -488,15 +494,6 @@
# take group id from last column and remove from predictors
group_id = X[:, -1]
X = np.delete(X, -1, axis=1)
-
- # remove the clumped raster
- try:
- grass.run_command(
- "g.remove", name='tmp_roi_clumped', flags="f",
- type="raster", quiet=True)
- except:
- pass
-
else:
# extract training data from maplist without group Ids
# shuffle this data by default
@@ -527,26 +524,46 @@
save_training_data(X, y, group_id, save_training)
"""
- Train the classifier
- --------------------
+ Train the classifier --------------------------------------------------
"""
- # retrieve sklearn classifier object and parameters
- grass.message("Classifier = " + classifier)
-
+ # retrieve sklearn classifier object and parameters -------------------
clf, mode = \
model_classifiers(classifier, random_state,
C, max_depth, max_features, min_samples_split,
min_samples_leaf, n_estimators,
subsample, learning_rate, max_degree)
- # turn off balancing if mode = regression
- if mode == 'regression' and balance is True:
- balance = False
+ # set other parameters based on classification or regression ----------
+ if mode == 'classification':
+ if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
+ scorers = 'binary'
+ else:
+ scorers = 'multiclass'
+ search_scorer = make_scorer(cohen_kappa_score)
+ labels = np.unique(y)
+ else:
+ scorers = 'regression'
+ search_scorer = 'r2'
+ labels = None # no classes
+ balance = False # no balancing for regression
+ if probability is True:
+ grass.warning(
+ 'Class probabilities only valid for classifications...'
+ 'ignoring')
+ probability = False
- # remove empty items from the param_grid dict
- param_grid = {k: v for k, v in param_grid.iteritems() if v is not None}
+ # setup model selection model -----------------------------------------
+ if any(param_grid) is True and cv == 1:
+ grass.fatal('Hyperparameter search requires cv > 1')
+ if any(param_grid) is True or cv > 1:
+ if group_id is None:
+ search_cv_method = StratifiedKFold(
+ n_splits=cv, random_state=random_state)
+ else:
+ search_cv_method = GroupKFold(n_splits=cv)
+ # set-up parameter grid for hyperparameter search ---------------------
# check that dict keys are compatible for the selected classifier
clf_params = clf.get_params()
param_grid = { key: value for key, value in param_grid.iteritems() if key in clf_params}
@@ -555,130 +572,172 @@
# so that the train object will not perform GridSearchCV
if any(param_grid) is not True:
param_grid = None
+ else:
+ clf = GridSearchCV(estimator=clf, param_grid=param_grid,
+ scoring=search_scorer, n_jobs=-1,
+ cv=search_cv_method)
- # Decide on scoring metric scheme and scorer to for grid search
- if mode == 'classification':
- if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
- scorers = 'binary'
- else:
- scorers = 'multiclass'
- search_scorer = make_scorer(cohen_kappa_score)
+ # preprocessing options -----------------------------------------------
+ if balance is True:
+ sampling = random_oversampling(random_state)
else:
- scorers = 'regression'
- search_scorer = 'r2'
+ sampling = None
- if mode == 'regression' and probability is True:
- grass.warning(
- 'Class probabilities only valid for classifications...'
- 'ignoring')
- probability = False
+ if norm_data is True:
+ scaler = StandardScaler()
+ else:
+ scaler = None
- # create training object - onehot-encoded on-the-fly
- learn_m = train(clf, X, y, group_id, categorical_var=categorymaps,
- standardize=norm_data, balance=balance)
+ # create training object ----------------------------------------------
+ learn_m = train(clf, categorical_var=categorymaps,
+ preprocessing=scaler, sampling=sampling)
"""
- Fitting, parameter search and cross-validation
- ----------------
+ Fitting, parameter search and cross-validation ------------------------
"""
# fit and parameter search
- learn_m.fit(param_grid=param_grid, cv=tune_cv, scoring=search_scorer,
- random_state=random_state)
+ grass.message(os.linesep)
+ grass.message(('Fitting model using ' + classifier))
+ learn_m.fit(X, y, group_id)
if param_grid is not None:
- grass.message('\n')
+ grass.message(os.linesep)
grass.message('Best parameters:')
grass.message(str(learn_m.estimator.best_params_))
-
+
# If cv > 1 then use cross-validation to generate performance measures
if cv > 1:
- grass.message('\r\n')
- grass.message(
- "Cross validation global performance measures......:")
+ # check that a sufficient number of samples are present per class
+ if cv > np.histogram(y, bins=len(labels))[0].min():
+ grass.message(os.linesep)
+ grass.message('Number of cv folds is greater than number of '
+ 'samples in some classes. Cross-validation is being'
+ ' skipped')
+ else:
+ grass.message(os.linesep)
+ grass.message(
+ "Cross validation global performance measures......:")
- # cross-validate the training object
- learn_m.cross_val(scorers, cv, importances,
- n_permutations=n_permutations,
- random_state=random_state)
+ # cross-validate the training object
+ learn_m.cross_val(search_cv_method, X, y, group_id,
+ scorers, importances,
+ n_permutations=n_permutations,
+ random_state=random_state)
- if mode == 'classification':
- if scorers == 'binary':
- grass.message(
- "Accuracy :\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['accuracy'].mean(),
- learn_m.scores['accuracy'].std()))
- grass.message(
- "AUC :\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['auc'].mean(),
- learn_m.scores['auc'].std()))
- grass.message(
- "Kappa :\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['kappa'].mean(),
- learn_m.scores['kappa'].std()))
- grass.message(
- "Precision :\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['precision'].mean(),
- learn_m.scores['precision'].std()))
- grass.message(
- "Recall :\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['recall'].mean(),
- learn_m.scores['recall'].std()))
- grass.message(
- "Specificity:\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['specificity'].mean(),
- learn_m.scores['specificity'].std()))
- grass.message(
- "F1 :\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['f1'].mean(),
- learn_m.scores['f1'].std()))
+ scores = learn_m.get_cross_val_scores()
- if scorers == 'multiclass':
- grass.message(
- "Accuracy:\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['accuracy'].mean(),
- learn_m.scores['accuracy'].std()))
- grass.message(
- "Kappa :\t%0.3f\t+/-SD\t%0.3f" %
- (learn_m.scores['kappa'].mean(),
- learn_m.scores['kappa'].std()))
+ if mode == 'classification':
+ if scorers == 'binary':
+ grass.message(
+ "Accuracy :\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['accuracy'].mean(),
+ scores['accuracy'].std()))
+ grass.message(
+ "AUC :\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['auc'].mean(),
+ scores['auc'].std()))
+ grass.message(
+ "Kappa :\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['kappa'].mean(),
+ scores['kappa'].std()))
+ grass.message(
+ "Precision :\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['precision'].mean(),
+ scores['precision'].std()))
+ grass.message(
+ "Recall :\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['recall'].mean(),
+ scores['recall'].std()))
+ grass.message(
+ "Specificity:\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['specificity'].mean(),
+ scores['specificity'].std()))
+ grass.message(
+ "F1 :\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['f1'].mean(),
+ scores['f1'].std()))
- # classification report
- grass.message("\n")
- grass.message("Classification report:")
- grass.message(learn_m.scores_cm)
+ if scorers == 'multiclass':
+ # global scores
+ grass.message(
+ "Accuracy:\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['accuracy'].mean(),
+ scores['accuracy'].std()))
+ grass.message(
+ "Kappa :\t%0.3f\t+/-SD\t%0.3f" %
+ (scores['kappa'].mean(),
+ scores['kappa'].std()))
- else:
- grass.message("R2:\t%0.3f\t+/-\t%0.3f" %
- (learn_m.scores['r2'].mean(),
- learn_m.scores['r2'].std()))
+ # per class scores
+ grass.message(os.linesep)
+ grass.message('Cross validation class performance measures......:')
+ mat_precision = np.matrix(scores['precision'])
+ mat_recall = np.matrix(scores['recall'])
+ mat_f1 = np.matrix(scores['f1'])
- # write cross-validation results for csv file
- if errors_file != '':
- try:
- import pandas as pd
- errors = pd.DataFrame(learn_m.scores)
- errors.to_csv(errors_file, mode='w')
- except:
- grass.warning('Pandas is not installed. Pandas is '
- 'required to write the cross-validation '
- 'results to file')
+ grass.message('Class \t' + '\t'.join(map(str, labels)))
+ grass.message(
+ 'Precision mean \t' + '\t'.join(
+ map(str, np.round(
+ mat_precision.mean(axis=0), 2)[0])))
+ grass.message(
+ 'Precision std \t' + '\t'.join(
+ map(str, np.round(
+ mat_precision.std(axis=0), 2)[0])))
+ grass.message(
+ 'Recall mean \t' + '\t'.join(
+ map(str, np.round(
+ mat_recall.mean(axis=0), 2)[0])))
+ grass.message(
+ 'Recall std \t' + '\t'.join(
+ map(str, np.round(
+ mat_recall.std(axis=0), 2)[0])))
+ grass.message(
+ 'F1 score mean \t' + '\t'.join(
+ map(str, np.round(
+ mat_f1.mean(axis=0), 2)[0])))
+ grass.message(
+ 'F1 score std \t' + '\t'.join(
+ map(str, np.round(
+ mat_f1.std(axis=0), 2)[0])))
- # feature importances
- if importances is True:
- grass.message("\r\n")
- grass.message("Feature importances")
- grass.message("id" + "\t" + "Raster" + "\t" + "Importance")
+ # remove perclass scores from dict
+ del scores['precision']
+ del scores['recall']
+ del scores['f1']
- # mean of cross-validation feature importances
- for i in range(len(learn_m.fimp.mean(axis=0))):
- grass.message(
- str(i) + "\t" + maplist[i] +
- "\t" + str(round(learn_m.fimp.mean(axis=0)[i], 4)))
+ else:
+ grass.message("R2:\t%0.3f\t+/-\t%0.3f" %
+ (scores['r2'].mean(),
+ scores['r2'].std()))
- if fimp_file != '':
- np.savetxt(fname=fimp_file, X=learn_m.fimp, delimiter=',',
- header=','.join(maplist), comments='')
+ # write cross-validation results for csv file
+ if errors_file != '':
+ try:
+ import pandas as pd
+ errors = pd.DataFrame(scores)
+ errors.to_csv(errors_file, mode='w')
+ except:
+ grass.warning('Pandas is not installed. Pandas is '
+ 'required to write the cross-validation '
+ 'results to file')
+
+ # feature importances
+ if importances is True:
+ grass.message(os.linesep)
+ grass.message("Feature importances")
+ grass.message("id" + "\t" + "Raster" + "\t" + "Importance")
+
+ # mean of cross-validation feature importances
+ for i in range(len(learn_m.fimp.mean(axis=0))):
+ grass.message(
+ str(i) + "\t" + maplist[i] +
+ "\t" + str(round(learn_m.fimp.mean(axis=0)[i], 4)))
+
+ if fimp_file != '':
+ np.savetxt(fname=fimp_file, X=learn_m.fimp, delimiter=',',
+ header=','.join(maplist), comments='')
else:
# load a previously fitted train object
# -------------------------------------
@@ -686,20 +745,18 @@
# load a previously fitted model
learn_m = joblib.load(model_load)
- """
- Optionally save the fitted model
- ---------------------
- """
-
+ # Optionally save the fitted model
if model_save != '':
joblib.dump(learn_m, model_save)
"""
- Prediction on the rest of the GRASS rasters in the imagery group
- ----------------------------------------------------------------
+ Prediction on the rest of the GRASS rasters in the imagery group ----------
"""
+
if modelonly is not True:
- learn_m.predict(maplist, output, probability, rowincr)
+ grass.message(os.linesep)
+ grass.message('Predicting raster...')
+ learn_m.predict(maplist, output, labels, probability, rowincr)
else:
grass.message("Model built and now exiting")
Modified: grass-addons/grass7/raster/r.learn.ml/raster_learning.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/raster_learning.py 2017-02-13 15:31:59 UTC (rev 70550)
+++ grass-addons/grass7/raster/r.learn.ml/raster_learning.py 2017-02-13 16:43:14 UTC (rev 70551)
@@ -1,12 +1,9 @@
# -*- coding: utf-8 -*-
-"""
-Created on Tue Feb 7 09:03:10 2017
- at author: steve
-"""
-
import os
+import scipy
import numpy as np
+from numpy.random import RandomState
from copy import deepcopy
import tempfile
import grass.script as grass
@@ -17,59 +14,27 @@
from subprocess import PIPE
-class train():
+class random_oversampling():
- def __init__(self, estimator, X, y, groups=None, categorical_var=None,
- standardize=False, balance=False):
+ def __init__(self, random_state):
"""
- Train class to perform preprocessing, fitting, parameter search and
- cross-validation in a single step
+ Balances X, y observations using simple oversampling
Args
----
- estimator: Scikit-learn compatible estimator object
- X, y: training data and labels as numpy arrays
- groups: groups to be used for cross-validation
- categorical_var: 1D list containing indices of categorical predictors
- standardize: Transform predictors
- balance: boolean to balance number of classes
+ random_state: Seed to pass onto random number generator
"""
- # fitting data
- self.estimator = estimator
- self.X = X
- self.y = y
- self.groups = groups
- self.balance = balance
+ self.random_state = random_state
- # for onehot-encoding
- self.enc = None
- self.categorical_var = categorical_var
- self.category_values = None
-
- if self.categorical_var:
- self.__onehotencode()
-
- # for standardization
- if standardize is True:
- self.standardization()
- else:
- self.scaler = None
-
- # for cross-validation scores
- self.scores = None
- self.scores_cm = None
- self.fimp = None
-
- def __random_oversampling(self, X, y, random_state=None):
+ def fit_sample(self, X, y):
"""
- Balances X, y observations using simple oversampling
+ Performs equal balancing of response and explanatory variances
Args
----
X: numpy array of training data
y: 1D numpy array of response data
- random_state: Seed to pass onto random number generator
Returns
-------
@@ -77,7 +42,7 @@
y_resampled: Numpy array of resampled response data
"""
- np.random.seed(seed=random_state)
+ np.random.seed(seed=self.random_state)
# count the number of observations per class
y_classes = np.unique(y)
@@ -103,10 +68,54 @@
return (X_resampled, y_resampled)
- def __onehotencode(self):
+
+class train():
+
+ def __init__(self, estimator, categorical_var=None,
+ preprocessing=None, sampling=None):
"""
+ Train class to perform preprocessing, fitting, parameter search and
+ cross-validation in a single step
+
+ Args
+ ----
+ estimator: Scikit-learn compatible estimator object
+ categorical_var: 1D list containing indices of categorical predictors
+ preprocessing: Sklearn preprocessing scaler
+ sampling: Balancing object e.g. from imbalance-learn
+ """
+
+ # fitting data
+ self.estimator = estimator
+
+ # for onehot-encoding
+ self.enc = None
+ self.categorical_var = categorical_var
+ self.category_values = None
+
+ if self.categorical_var:
+ self.__onehotencode()
+
+ # for preprocessing of data
+ self.sampling = sampling
+ self.preprocessing = preprocessing
+
+ # for cross-validation scores
+ self.scores = None
+ self.scores_cm = None
+ self.fimp = None
+ self.mean_tpr = None
+ self.mean_fpr = None
+
+ def __onehotencode(self, X):
+
+ """
Method to convert a list of categorical arrays in X into a suite of
- binary predictors which are added to the left of the array
+ binary predictors which are added to the end of the array
+
+ Args
+ ----
+ X: 2D numpy array containing training data
"""
from sklearn.preprocessing import OneHotEncoder
@@ -114,140 +123,89 @@
# store original range of values
self.category_values = [0] * len(self.categorical_var)
for i, cat in enumerate(self.categorical_var):
- self.category_values[i] = np.unique(self.X[:, cat])
+ self.category_values[i] = np.unique(X[:, cat])
# fit and transform categorical grids to a suite of binary features
self.enc = OneHotEncoder(categorical_features=self.categorical_var,
sparse=False)
- self.enc.fit(self.X)
- self.X = self.enc.transform(self.X)
+ self.enc.fit(X)
+ X = self.enc.transform(X)
- def fit(self, param_distributions=None, param_grid=None,
- scoring=None, n_iter=3, cv=3, random_state=None):
+ return(X)
+ def fit(self, X, y, groups=None):
+
"""
- Main fit method for the train object. Performs fitting, hyperparameter
- search and cross-validation in one step (inspired from R's CARET)
+ Main fit method for the train object
Args
----
- param_distributions: continuous parameter distribution to be used in a
- randomizedCVsearch
- param_grid: Dist of non-continuous parameters to grid search
- n_iter: Number of randomized search iterations
- cv: Number of cross-validation folds for parameter tuning
- random_state: seed to be used during random number generation
+ X, y: training data and labels as numpy arrays
+ groups: groups to be used for cross-validation
"""
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
- from sklearn.model_selection import GroupKFold
- # Balance classes
- if self.balance is True:
- X, y = self.__random_oversampling(
- self.X, self.y, random_state=random_state)
+ # Balance classes prior to fitting
+ if self.sampling is not None:
+ # balance samples
+ y_original = deepcopy(y)
+ X, y = self.sampling.fit_sample(X, y)
- if self.groups is not None:
- groups, _ = self.__random_oversampling(
- self.groups, self.y, random_state=random_state)
- else:
- groups = None
- else:
- X = self.X
- y = self.y
- groups = self.groups
+ # balance groups if present
+ if groups is not None:
+ groups, _ = self.sampling.fit_sample(
+ groups.reshape(-1, 1), y_original)
- # Randomized or grid search
- if param_distributions is not None or param_grid is not None:
+ if self.preprocessing is not None:
+ X = self.__preprocessor(X)
- # use groupkfold for hyperparameter search if groups are present
- if self.groups is not None:
- cv_search = GroupKFold(n_splits=cv)
- else:
- cv_search = cv
+ if self.categorical_var is not None:
+ X = self.__onehotencode(X)
- # Randomized search
- if param_distributions is not None:
- self.estimator = RandomizedSearchCV(
- estimator=self.estimator,
- param_distributions=param_distributions,
- n_iter=n_iter, scoring=scoring,
- cv=cv_search)
+ # fit the model on the training data and predict the test data
+ # need the groups parameter because the estimator can be a
+ # RandomizedSearchCV or GridSearchCV estimator where cv=GroupKFold
+ if isinstance(self.estimator, RandomizedSearchCV) \
+ or isinstance(self.estimator, GridSearchCV):
+ param_search = True
+ else:
+ param_search = False
- # Grid Search
- if param_grid is not None:
- self.estimator = GridSearchCV(self.estimator,
- param_grid,
- n_jobs=-1, cv=cv_search,
- scoring=scoring)
-
- # if groups then fit RandomizedSearchCV.fit requires groups param
- if self.groups is None:
- self.estimator.fit(X, y)
- else:
- self.estimator.fit(X, y, groups=groups)
-
- # Fitting without parameter search
+ if groups is not None and param_search is True:
+ self.estimator.fit(X, y, groups=groups)
else:
self.estimator.fit(X, y)
- def standardization(self):
+ def __preprocessor(self, X):
"""
Transforms the non-categorical X
+
+ Args
+ ----
+ X; 2D numpy array to transform
"""
- from sklearn.preprocessing import StandardScaler
-
# create mask so that indices that represent categorical
# predictors are not selected
if self.categorical_var is not None:
- idx = np.arange(self.X.shape[1])
+ idx = np.arange(X.shape[1])
mask = np.ones(len(idx), dtype=bool)
mask[self.categorical_var] = False
else:
- mask = np.arange(self.X.shape[1])
+ mask = np.arange(X.shape[1])
- X_continuous = self.X[:, mask]
- self.scaler = StandardScaler()
- self.scaler.fit(X_continuous)
- self.X[:, mask] = self.scaler.transform(X_continuous)
+ X_continuous = X[:, mask]
+ self.preprocessing.fit(X=X_continuous)
+ X[:, mask] = self.preprocessing.transform(X_continuous)
- def __pred_func(self, estimator, X_test, y_true, scorers):
- """
- Calculates a single performance metric depending on if scorer type
- is binary, multiclass or regression
+ return(X)
- To be called from the varImp_permutation
+ def varImp_permutation(self, estimator, X_test, y_true,
+ n_permutations, scorers,
+ random_state):
- Args
- ----
- estimator: fitted estimator on training set
- X_test: Test training data
- y_true: Test labelled data
- scorers: String indicating which type of scorer to be used
"""
-
- from sklearn import metrics
-
- if scorers == 'binary':
- scorer = metrics.roc_auc_score
- y_pred = estimator.predict_proba(X_test)[:, 1]
- if scorers == 'multiclass':
- scorer = metrics.accuracy_score
- y_pred = estimator.predict(X_test)
- if scorers == 'regression':
- scorer = metrics.r2_score
- y_pred = estimator.predict(X_test)
-
- score = scorer(y_true, y_pred)
-
- return (score)
-
- def __varImp_permutation(self, estimator, X_test, y_true,
- n_permutations, scorers,
- random_state):
-
- """
Method to perform permutation-based feature importance during
cross-validation (cross-validation is applied externally to this
method)
@@ -272,11 +230,19 @@
scores: AUC scores for each predictor following permutation
"""
+ from sklearn import metrics
+ if scorers == 'binary' or scorers == 'multiclass':
+ scorer = metrics.accuracy_score
+ if scorers == 'regression':
+ scorer = metrics.r2_score
+
# calculate score on original variables without permutation
# determine best metric type for binary/multiclass/regression scenarios
- best_score = self.__pred_func(estimator, X_test, y_true, scorers)
+ y_pred = estimator.predict(X_test)
+ best_score = scorer(y_true, y_pred)
np.random.seed(seed=random_state)
+ rstate = RandomState(random_state)
scores = np.zeros((n_permutations, X_test.shape[1]))
# outer loop to repeat the pemutation rep times
@@ -286,11 +252,11 @@
# difference in auc
for i in range(X_test.shape[1]):
Xscram = np.copy(X_test)
- Xscram[:, i] = np.random.choice(X_test[:, i], X_test.shape[0])
+ Xscram[:, i] = rstate.choice(X_test[:, i], X_test.shape[0])
# fit the model on the training data and predict the test data
- scores[rep, i] = best_score-self.__pred_func(
- estimator, Xscram, y_true, scorers)
+ y_pred = estimator.predict(Xscram)
+ scores[rep, i] = best_score-scorer(y_true, y_pred)
if scores[rep, i] < 0:
scores[rep, i] = 0
@@ -301,6 +267,19 @@
def specificity_score(self, y_true, y_pred):
+ """
+ Simple method to calculate specificity score
+
+ Args
+ ----
+ y_true: 1D numpy array of truth values
+ y_pred: 1D numpy array of predicted classes
+
+ Returns
+ -------
+ specificity: specificity score
+ """
+
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred)
@@ -314,11 +293,10 @@
return (specificity)
- def cross_val(self, scorers='binary', cv=3, feature_importances=False,
- n_permutations=25, random_state=None):
+ def cross_val(self, splitter, X, y, groups=None, scorers='binary',
+ feature_importances=False, n_permutations=25,
+ random_state=None):
- from sklearn.model_selection import StratifiedKFold
- from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics
@@ -330,17 +308,33 @@
Args
----
- scorers: Suite of performance metrics to use
- cv: Integer of cross-validation folds
+ splitter: Scikit learn model_selection object, e.g. StratifiedKFold
+ X, y: 2D numpy array of training data and 1D array of labels
+ groups: 1D numpy array of groups to be used for cross-validation
+ scorers: String specifying suite of performance metrics to use
feature_importances: Boolean to perform permutation-based importances
n_permutations: Number of permutations during feature importance
random_state: Seed to pass to the random number generator
"""
+ # preprocessing -------------------------------------------------------
+ if self.preprocessing is not None:
+ X = self.__preprocessor(X)
+
+ if self.categorical_var is not None:
+ X = self.__onehotencode(X)
+
# create copy of fitting estimator for cross-val fitting
fit_train = deepcopy(self.estimator)
- # dictionary of lists to store metrics
+ # create dictionary of lists to store metrics -------------------------
+ n_classes = len(np.unique(y))
+
+ if scorers == 'accuracy':
+ self.scores = {
+ 'accuracy': []
+ }
+
if scorers == 'binary':
self.scores = {
'accuracy': [],
@@ -355,77 +349,89 @@
if scorers == 'multiclass':
self.scores = {
'accuracy': [],
- 'f1': [],
- 'kappa': []
- }
+ 'kappa': [],
+ 'precision': np.zeros((0, n_classes)), # scores per sample
+ 'recall': np.zeros((0, n_classes)),
+ 'f1': np.zeros((0, n_classes))
+ }
if scorers == 'regression':
self.scores = {
'r2': []
}
- y_test_agg = []
- y_pred_agg = []
- self.fimp = np.zeros((cv, self.X.shape[1]))
+ self.mean_tpr = 0
+ self.mean_fpr = np.linspace(0, 1, 100)
- # generate Kfold indices
- if self.groups is None:
- k_fold = StratifiedKFold(
- n_splits=cv,
- shuffle=False,
- random_state=random_state).split(self.X, self.y)
+ # create np array to store feature importance scores
+ # for each predictor per fold
+ if feature_importances is True:
+ self.fimp = np.zeros((splitter.get_n_splits(), X.shape[1]))
+ self.fimp[:] = np.nan
+
+ # generate Kfold indices ----------------------------------------------
+
+ if groups is None:
+ k_fold = splitter.split(X, y)
else:
- k_fold = GroupKFold(n_splits=cv).split(
- self.X, self.y, groups=self.groups)
+ k_fold = splitter.split(
+ X, y, groups=groups)
+ # train on k-1 folds and test of k folds ------------------------------
+
for train_indices, test_indices in k_fold:
# get indices for train and test partitions
- X_train, X_test = self.X[train_indices], self.X[test_indices]
- y_train, y_test = self.y[train_indices], self.y[test_indices]
+ X_train, X_test = X[train_indices], X[test_indices]
+ y_train, y_test = y[train_indices], y[test_indices]
+ if groups is not None:
+ groups_train = groups[train_indices]
# balance the fold
- if self.balance is True:
- X_train, y_train = self.__random_oversampling(
- X_train, y_train, random_state=random_state)
- if self.groups is not None:
- groups_train = self.groups[train_indices]
- groups_train, _ = self.__random_oversampling(
- groups_train, self.y[train_indices],
- random_state=random_state)
+ if self.sampling is not None:
+ y_train_original = deepcopy(y_train)
+ X_train, y_train = self.sampling.fit_sample(
+ X_train, y_train)
+ if groups is not None:
+ groups_train, _ = self.sampling.fit_sample(
+ groups_train.reshape(-1, 1), y_train_original)
+
else:
# also get indices of groups for the training partition
- if self.groups is not None:
- groups_train = self.groups[train_indices]
+ if groups is not None:
+ groups_train = groups[train_indices]
# fit the model on the training data and predict the test data
# need the groups parameter because the estimator can be a
- # RandomizedSearchCV estimator where cv=GroupKFold
- if isinstance(self.estimator, RandomizedSearchCV) is True \
- or isinstance(self.estimator, GridSearchCV):
+ # RandomizedSearchCV or GridSearchCV estimator where cv=GroupKFold
+ if isinstance(fit_train, RandomizedSearchCV) is True \
+ or isinstance(fit_train, GridSearchCV):
param_search = True
else:
param_search = False
- if self.groups is not None and param_search is True:
+ # train fit_train on training fold
+ if groups is not None and param_search is True:
fit_train.fit(X_train, y_train, groups=groups_train)
else:
fit_train.fit(X_train, y_train)
+ # prediction of test fold
y_pred = fit_train.predict(X_test)
-
- y_test_agg = np.append(y_test_agg, y_test)
- y_pred_agg = np.append(y_pred_agg, y_pred)
-
labels = np.unique(y_pred)
# calculate metrics
- if scorers == 'binary':
+ if scorers == 'accuracy':
self.scores['accuracy'] = np.append(
self.scores['accuracy'],
metrics.accuracy_score(y_test, y_pred))
+ elif scorers == 'binary':
+ self.scores['accuracy'] = np.append(
+ self.scores['accuracy'],
+ metrics.accuracy_score(y_test, y_pred))
+
y_pred_proba = fit_train.predict_proba(X_test)[:, 1]
self.scores['auc'] = np.append(
self.scores['auc'],
@@ -451,8 +457,9 @@
self.scores['kappa'],
metrics.cohen_kappa_score(y_test, y_pred))
- self.scores_cm = metrics.classification_report(
- y_test_agg, y_pred_agg)
+ fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
+ self.mean_tpr += scipy.interp(self.mean_fpr, fpr, tpr)
+ self.mean_tpr[0] = 0.0
elif scorers == 'multiclass':
@@ -464,33 +471,47 @@
self.scores['kappa'],
metrics.cohen_kappa_score(y_test, y_pred))
- self.scores_cm = metrics.classification_report(
- y_test_agg, y_pred_agg)
+ self.scores['precision'] = np.vstack((
+ self.scores['precision'],
+ np.array(metrics.precision_score(
+ y_test, y_pred, average=None))))
+ self.scores['recall'] = np.vstack((
+ self.scores['recall'],
+ np.array(metrics.recall_score(
+ y_test, y_pred, average=None))))
+
+ self.scores['f1'] = np.vstack((
+ self.scores['f1'],
+ np.array(metrics.f1_score(
+ y_test, y_pred, average=None))))
+
elif scorers == 'regression':
self.scores['r2'] = np.append(
self.scores['r2'], metrics.r2_score(y_test, y_pred))
# feature importances using permutation
if feature_importances is True:
- if bool((self.fimp == 0).all()) is True:
- self.fimp = self.__varImp_permutation(
+ if bool((np.isnan(self.fimp)).all()) is True:
+ self.fimp = self.varImp_permutation(
fit_train, X_test, y_test, n_permutations, scorers,
random_state)
else:
self.fimp = np.row_stack(
- (self.fimp, self.__varImp_permutation(
+ (self.fimp, self.varImp_permutation(
fit_train, X_test, y_test,
n_permutations, scorers, random_state)))
+ # summarize data ------------------------------------------------------
+
# convert onehot-encoded feature importances back to original vars
if self.fimp is not None and self.enc is not None:
# get start,end positions of each suite of onehot-encoded vars
feature_ranges = deepcopy(self.enc.feature_indices_)
for i in range(0, len(self.enc.feature_indices_)-1):
- feature_ranges[i+1] =\
- feature_ranges[i] + len(self.category_values[i])
+ feature_ranges[i+1] = feature_ranges[i] + \
+ len(self.category_values[i])
# take sum of each onehot-encoded feature
ohe_feature = [0] * len(self.categorical_var)
@@ -498,7 +519,7 @@
for i in range(len(self.categorical_var)):
ohe_feature[i] = \
- self.fimp[:, feature_ranges[i]:feature_ranges[i+1]]
+ self.fimp[:, feature_ranges[i]:feature_ranges[i+1]]
ohe_sum[i] = ohe_feature[i].sum(axis=1)
# remove onehot-encoded features from the importances array
@@ -507,12 +528,22 @@
# insert summed importances into original positions
for index in self.categorical_var:
- self.fimp = np.insert(
- self.fimp, np.array(index), ohe_sum[0], axis=1)
+ self.fimp = np.insert(self.fimp, np.array(index),
+ ohe_sum[0], axis=1)
- def predict(self, predictors, output, class_probabilities=False,
- rowincr=25):
+ if scorers == 'binary':
+ self.mean_tpr /= splitter.get_n_splits(X, y)
+ self.mean_tpr[-1] = 1.0
+ def get_roc_curve(self):
+ return (self.mean_tpr, self.mean_fpr)
+
+ def get_cross_val_scores(self):
+ return (self.scores)
+
+ def predict(self, predictors, output, labels=None,
+ class_probabilities=False, rowincr=25):
+
"""
Prediction on list of GRASS rasters using a fitted scikit learn model
@@ -527,9 +558,7 @@
"""
# determine output data type and nodata
- predicted = self.estimator.predict(self.X)
-
- if bool((predicted % 1 == 0).all()) is True:
+ if labels is not None:
ftype = 'CELL'
nodata = -2147483648
else:
@@ -568,7 +597,6 @@
if class_probabilities is True:
# determine number of classes
- labels = np.unique(self.y)
nclasses = len(labels)
prob_out_raster = [0] * nclasses
@@ -583,118 +611,122 @@
"""
Prediction using row blocks
"""
+ try:
+ for rowblock in range(0, current.rows, rowincr):
+ grass.percent(rowblock, current.rows, rowincr)
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
+ mask_np_row = np.zeros((rowincr, current.cols))
- for rowblock in range(0, current.rows, rowincr):
+ # loop through each row, and each band
+ # and add these values to the 2D array img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
- # check that the row increment does not exceed the number of rows
- if rowblock+rowincr > current.rows:
- rowincr = current.rows - rowblock
- img_np_row = np.zeros((rowincr, current.cols, n_features))
- mask_np_row = np.zeros((rowincr, current.cols))
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
- # loop through each row, and each band
- # and add these values to the 2D array img_np_row
- for row in range(rowblock, rowblock+rowincr, 1):
- mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
+ mask_np_row[mask_np_row == -2147483648] = np.nan
+ nanmask = np.isnan(mask_np_row) # True in mask means invalid data
- for band in range(n_features):
- img_np_row[row-rowblock, :, band] = \
- np.array(rasstack[band][row])
+ # reshape each row-band matrix into a n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
- mask_np_row[mask_np_row == -2147483648] = np.nan
- nanmask = np.isnan(mask_np_row) # True in mask means invalid data
+ # remove NaN values and GRASS CELL nodata vals
+ flat_pixels[flat_pixels == -2147483648] = np.nan
+ flat_pixels = np.nan_to_num(flat_pixels)
- # reshape each row-band matrix into a n*m array
- nsamples = rowincr * current.cols
- flat_pixels = img_np_row.reshape((nsamples, n_features))
+ # rescale
+ if self.preprocessing is not None:
+ # create mask so that indices that represent categorical
+ # predictors are not selected
+ if self.categorical_var is not None:
+ idx = np.arange(n_features)
+ mask = np.ones(len(idx), dtype=bool)
+ mask[self.categorical_var] = False
+ else:
+ mask = np.arange(n_features)
+ flat_pixels_continuous = flat_pixels[:, mask]
+ flat_pixels[:, mask] = self.preprocessing.transform(
+ flat_pixels_continuous)
- # remove NaN values and GRASS CELL nodata vals
- flat_pixels[flat_pixels == -2147483648] = np.nan
- flat_pixels = np.nan_to_num(flat_pixels)
+ # onehot-encoding
+ if self.enc is not None:
+ try:
+ flat_pixels = self.enc.transform(flat_pixels)
+ except:
+ # if this fails it is because the onehot-encoder was fitted
+ # on the training samples, but the prediction data contains
+ # new values, i.e. the training data has not sampled all of
+ # categories
+ grass.fatal('There are values in the categorical rasters '
+ 'that are not present in the training data '
+ 'set, i.e. the training data has not sampled '
+ 'all of the categories')
- # onehot-encoding
- if self.enc is not None:
- try:
- flat_pixels = self.enc.transform(flat_pixels)
- except:
- # if this fails it is because the onehot-encoder was fitted
- # on the training samples, but the prediction data contains
- # new values, i.e. the training data has not sampled all of
- # categories
- grass.fatal('There are values in the categorical rasters '
- 'that are not present in the training data '
- 'set, i.e. the training data has not sampled '
- 'all of the categories')
+ # perform prediction
+ result = self.estimator.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
- # rescale
- if self.scaler is not None:
- # create mask so that indices that represent categorical
- # predictors are not selected
- if self.categorical_var is not None:
- idx = np.arange(self.X.shape[1])
- mask = np.ones(len(idx), dtype=bool)
- mask[self.categorical_var] = False
- else:
- mask = np.arange(self.X.shape[1])
- flat_pixels_continuous = flat_pixels[:, mask]
- flat_pixels[:, mask] = self.scaler.transform(
- flat_pixels_continuous)
+ # replace NaN values so that the prediction does not have a border
+ result = np.ma.masked_array(
+ result, mask=nanmask, fill_value=-99999)
- # perform prediction
- result = self.estimator.predict(flat_pixels)
- result = result.reshape((rowincr, current.cols))
+ # return a copy of result, with masked values filled with a value
+ result = result.filled([nodata])
- # replace NaN values so that the prediction does not have a border
- result = np.ma.masked_array(
- result, mask=nanmask, fill_value=-99999)
+ # for each row we can perform computation, and write the result
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
- # return a copy of result, with masked values filled with a value
- result = result.filled([nodata])
+ # same for probabilities
+ if class_probabilities is True:
+ result_proba = self.estimator.predict_proba(flat_pixels)
- # for each row we can perform computation, and write the result
- for row in range(rowincr):
- newrow = Buffer((result.shape[1],), mtype=ftype)
- newrow[:] = result[row, :]
- classification.put_row(newrow)
+ for iclass in range(result_proba.shape[1]):
- # same for probabilities
- if class_probabilities is True:
- result_proba = self.estimator.predict_proba(flat_pixels)
+ result_proba_class = result_proba[:, iclass]
+ result_proba_class = result_proba_class.reshape(
+ (rowincr, current.cols))
- for iclass in range(result_proba.shape[1]):
+ result_proba_class = np.ma.masked_array(
+ result_proba_class, mask=nanmask, fill_value=np.nan)
- result_proba_class = result_proba[:, iclass]
- result_proba_class = result_proba_class.reshape(
- (rowincr, current.cols))
+ result_proba_class = result_proba_class.filled([np.nan])
- result_proba_class = np.ma.masked_array(
- result_proba_class, mask=nanmask, fill_value=np.nan)
+ for row in range(rowincr):
+ newrow = Buffer((
+ result_proba_class.shape[1],),
+ mtype='FCELL')
- result_proba_class = result_proba_class.filled([np.nan])
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+ finally:
+ # close all predictors
+ for i in range(n_features):
+ rasstack[i].close()
- for row in range(rowincr):
+ # close classification and mask maps
+ classification.close()
+ mask_raster.close()
- newrow = Buffer((
- result_proba_class.shape[1],),
- mtype='FCELL')
+ grass.run_command("g.remove", name='tmp_clfmask',
+ flags="f", type="raster", quiet=True)
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
+ # close all class probability maps
+ try:
+ for iclass in range(nclasses):
+ prob[iclass].close()
+ except:
+ pass
- # close all maps
- for i in range(n_features):
- rasstack[i].close()
- classification.close()
- mask_raster.close()
-
- try:
- for iclass in range(nclasses):
- prob[iclass].close()
- except:
- pass
-
-
def model_classifiers(estimator='LogisticRegression', random_state=None,
C=1, max_depth=None, max_features='auto',
min_samples_split=2, min_samples_leaf=1,
More information about the grass-commit
mailing list