[GRASS-SVN] r70499 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
Tue Feb 7 11:55:18 PST 2017
Author: spawley
Date: 2017-02-07 11:55:18 -0800 (Tue, 07 Feb 2017)
New Revision: 70499
Added:
grass-addons/grass7/raster/r.learn.ml/raster_learning.py
Modified:
grass-addons/grass7/raster/r.learn.ml/Makefile
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
Log:
r.learn.ml: moved core classes and functions into a separate module that can be imported as part of a pygrass session
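
The new raster_learning module is installed alongside the script through the ETCFILES entry added to the Makefile below, so that a pygrass session can put it on the import path with set_path. A minimal sketch of the import mechanics, as used by the revised r.learn.ml.py:

    from grass.pygrass.utils import set_path

    # add the r.learn.ml etc directory to sys.path so the
    # bundled raster_learning module can be found
    set_path('r.learn.ml')

    from raster_learning import train, model_classifiers
    from raster_learning import extract, maps_from_group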
Modified: grass-addons/grass7/raster/r.learn.ml/Makefile
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/Makefile 2017-02-07 19:49:48 UTC (rev 70498)
+++ grass-addons/grass7/raster/r.learn.ml/Makefile 2017-02-07 19:55:18 UTC (rev 70499)
@@ -2,6 +2,9 @@
PGM = r.learn.ml
+ETCFILES = raster_learning
+
include $(MODULE_TOPDIR)/include/Make/Script.make
+include $(MODULE_TOPDIR)/include/Make/Python.make
default: script
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-02-07 19:49:48 UTC (rev 70498)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-02-07 19:55:18 UTC (rev 70499)
@@ -299,697 +299,18 @@
#%end
import atexit
-import os
import numpy as np
import grass.script as grass
-import tempfile
from copy import deepcopy
-from grass.pygrass.modules.shortcuts import imagery as im
from grass.pygrass.modules.shortcuts import raster as r
-from subprocess import PIPE
-from grass.pygrass.raster import RasterRow
-from grass.pygrass.gis.region import Region
-from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.utils import set_path
+set_path('r.learn.ml')
+from raster_learning import train, model_classifiers
+from raster_learning import save_training_data, load_training_data
+from raster_learning import extract, maps_from_group
-class train():
- def __init__(self, estimator, X, y, groups=None, categorical_var=None,
- standardize=False, balance=False):
- """
- Train class to perform preprocessing, fitting, parameter search and
- cross-validation in a single step
-
- Args
- ----
- estimator: Scikit-learn compatible estimator object
- X, y: training data and labels as numpy arrays
- groups: groups to be used for cross-validation
- categorical_var: 1D list containing indices of categorical predictors
- standardize: Transform predictors
- balance: boolean to balance number of classes
- """
-
- # fitting data
- self.estimator = estimator
- self.X = X
- self.y = y
- self.groups = groups
- self.balance = balance
-
- # for onehot-encoding
- self.enc = None
- self.categorical_var = categorical_var
- self.category_values = None
-
- if self.categorical_var:
- self.onehotencode()
-
- # for standardization
- if standardize is True:
- self.standardization()
- else:
- self.scaler = None
-
- # for cross-validation scores
- self.scores = None
- self.scores_cm = None
- self.fimp = None
-
- def random_oversampling(self, X, y, random_state=None):
- """
- Balances X, y observations using simple oversampling
-
- Args
- ----
- X: numpy array of training data
- y: 1D numpy array of response data
- random_state: Seed to pass onto random number generator
-
- Returns
- -------
- X_resampled: Numpy array of resampled training data
- y_resampled: Numpy array of resampled response data
- """
-
- np.random.seed(seed=random_state)
-
- # count the number of observations per class
- y_classes = np.unique(y)
- class_counts = np.histogram(y, bins=len(y_classes))[0]
- maj_counts = class_counts.max()
-
- y_resampled = y
- X_resampled = X
-
- for cla, counts in zip(y_classes, class_counts):
- # get the number of samples needed to balance minority class
- num_samples = maj_counts - counts
-
- # get the indices of the ith class
- indx = np.nonzero(y == cla)
-
- # create some new indices
- oversamp_indx = np.random.choice(indx[0], size=num_samples)
-
- # concatenate to the original X and y
- y_resampled = np.concatenate((y[oversamp_indx], y_resampled))
- X_resampled = np.concatenate((X[oversamp_indx], X_resampled))
-
- return (X_resampled, y_resampled)
-
- def onehotencode(self):
- """
- Method to convert a list of categorical arrays in X into a suite of
- binary predictors which are added to the left of the array
- """
-
- from sklearn.preprocessing import OneHotEncoder
-
- # store original range of values
- self.category_values = [0] * len(self.categorical_var)
- for i, cat in enumerate(self.categorical_var):
- self.category_values[i] = np.unique(self.X[:, cat])
-
- # fit and transform categorical grids to a suite of binary features
- self.enc = OneHotEncoder(categorical_features=self.categorical_var,
- sparse=False)
- self.enc.fit(self.X)
- self.X = self.enc.transform(self.X)
-
- def fit(self, param_distributions=None, param_grid=None,
- scoring=None, n_iter=3, cv=3, random_state=None):
-
- """
- Main fit method for the train object. Performs fitting, hyperparameter
- search and cross-validation in one step (inspired from R's CARET)
-
- Args
- ----
- param_distributions: continuous parameter distribution to be used in a
- randomizedCVsearch
- param_grid: Dist of non-continuous parameters to grid search
- n_iter: Number of randomized search iterations
- cv: Number of cross-validation folds for parameter tuning
- random_state: seed to be used during random number generation
- """
-
- from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
- from sklearn.model_selection import GroupKFold
-
- # Balance classes
- if self.balance is True:
- X, y = self.random_oversampling(
- self.X, self.y, random_state=random_state)
-
- if self.groups is not None:
- groups, _ = self.random_oversampling(
- self.groups, self.y, random_state=random_state)
- else:
- groups = None
- else:
- X = self.X
- y = self.y
- groups = self.groups
-
- # Randomized or grid search
- if param_distributions is not None or param_grid is not None:
-
- # use groupkfold for hyperparameter search if groups are present
- if self.groups is not None:
- cv_search = GroupKFold(n_splits=cv)
- else:
- cv_search = cv
-
- # Randomized search
- if param_distributions is not None:
- self.estimator = RandomizedSearchCV(
- estimator=self.estimator,
- param_distributions=param_distributions,
- n_iter=n_iter, scoring=scoring,
- cv=cv_search)
-
- # Grid Search
- if param_grid is not None:
- self.estimator = GridSearchCV(self.estimator,
- param_grid,
- n_jobs=-1, cv=cv_search,
- scoring=scoring)
-
- # if groups then fit RandomizedSearchCV.fit requires groups param
- if self.groups is None:
- self.estimator.fit(X, y)
- else:
- self.estimator.fit(X, y, groups=groups)
-
- # Fitting without parameter search
- else:
- self.estimator.fit(X, y)
-
- def standardization(self):
- """
- Transforms the non-categorical X
- """
-
- from sklearn.preprocessing import StandardScaler
-
- # create mask so that indices that represent categorical
- # predictors are not selected
- if self.categorical_var is not None:
- idx = np.arange(self.X.shape[1])
- mask = np.ones(len(idx), dtype=bool)
- mask[self.categorical_var] = False
- else:
- mask = np.arange(self.X.shape[1])
-
- X_continuous = self.X[:, mask]
- self.scaler = StandardScaler()
- self.scaler.fit(X_continuous)
- self.X[:, mask] = self.scaler.transform(X_continuous)
-
- def pred_func(self, estimator, X_test, y_true, scorers):
- """
- Calculates a single performance metric depending on if scorer type
- is binary, multiclass or regression
-
- To be called from the varImp_permutation
-
- Args
- ----
- estimator: fitted estimator on training set
- X_test: Test training data
- y_true: Test labelled data
- scorers: String indicating which type of scorer to be used
- """
-
- from sklearn import metrics
-
- if scorers == 'binary':
- scorer = metrics.roc_auc_score
- y_pred = estimator.predict_proba(X_test)[:, 1]
- if scorers == 'multiclass':
- scorer = metrics.accuracy_score
- y_pred = estimator.predict(X_test)
- if scorers == 'regression':
- scorer = metrics.r2_score
- y_pred = estimator.predict(X_test)
-
- score = scorer(y_true, y_pred)
-
- return (score)
-
- def varImp_permutation(self, estimator, X_test, y_true,
- n_permutations, scorers,
- random_state):
-
- """
- Method to perform permutation-based feature importance during
- cross-validation (cross-validation is applied externally to this
- method)
-
- Procedure is:
- 1. Pass fitted estimator and test partition X y
- 2. Assess AUC on the test partition (bestauc)
- 3. Permute each variable and assess the difference between bestauc and
- the messed-up variable
- 4. Repeat (3) for many random permutations
- 5. Average the repeats
-
- Args
- ----
- estimator: estimator that has been fitted to a training partition
- X_test, y_true: data and labels from a test partition
- n_permutations: number of random permutations to apply
- random_state: seed to pass to the numpy random.seed
-
- Returns
- -------
- scores: AUC scores for each predictor following permutation
- """
-
- # calculate score on original variables without permutation
- # determine best metric type for binary/multiclass/regression scenarios
- best_score = self.pred_func(estimator, X_test, y_true, scorers)
-
- np.random.seed(seed=random_state)
- scores = np.zeros((n_permutations, X_test.shape[1]))
-
- # outer loop to repeat the pemutation rep times
- for rep in range(n_permutations):
-
- # inner loop to permute each predictor variable and assess
- # difference in auc
- for i in range(X_test.shape[1]):
- Xscram = np.copy(X_test)
- Xscram[:, i] = np.random.choice(X_test[:, i], X_test.shape[0])
-
- # fit the model on the training data and predict the test data
- scores[rep, i] = best_score-self.pred_func(
- estimator, Xscram, y_true, scorers)
- if scores[rep, i] < 0:
- scores[rep, i] = 0
-
- # average the repetitions
- scores = scores.mean(axis=0)
-
- return(scores)
-
- def specificity_score(self, y_true, y_pred):
-
- from sklearn.metrics import confusion_matrix
-
- cm = confusion_matrix(y_true, y_pred)
-
- tn = float(cm[0][0])
- # fn = float(cm[1][0])
- # tp = float(cm[1][1])
- fp = float(cm[0][1])
-
- specificity = tn/(tn+fp)
-
- return (specificity)
-
- def cross_val(self, scorers='binary', cv=3, feature_importances=False,
- n_permutations=25, random_state=None):
-
- from sklearn.model_selection import StratifiedKFold
- from sklearn.model_selection import GroupKFold
- from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
- from sklearn import metrics
-
- """
- Stratified Kfold and GroupFold cross-validation
- Generates suites of scoring_metrics for binary classification,
-
- multiclass classification and regression scenarios
-
- Args
- ----
- scorers: Suite of performance metrics to use
- cv: Integer of cross-validation folds
- feature_importances: Boolean to perform permutation-based importances
- n_permutations: Number of permutations during feature importance
- random_state: Seed to pass to the random number generator
- """
-
- # create copy of fitting estimator for cross-val fitting
- fit_train = deepcopy(self.estimator)
-
- # dictionary of lists to store metrics
- if scorers == 'binary':
- self.scores = {
- 'accuracy': [],
- 'precision': [],
- 'recall': [],
- 'specificity': [],
- 'f1': [],
- 'kappa': [],
- 'auc': []
- }
-
- if scorers == 'multiclass':
- self.scores = {
- 'accuracy': [],
- 'f1': [],
- 'kappa': []
- }
-
- if scorers == 'regression':
- self.scores = {
- 'r2': []
- }
-
- y_test_agg = []
- y_pred_agg = []
- self.fimp = np.zeros((cv, self.X.shape[1]))
-
- # generate Kfold indices
- if self.groups is None:
- k_fold = StratifiedKFold(
- n_splits=cv,
- shuffle=False,
- random_state=random_state).split(self.X, self.y)
- else:
- k_fold = GroupKFold(n_splits=cv).split(
- self.X, self.y, groups=self.groups)
-
- for train_indices, test_indices in k_fold:
-
- # get indices for train and test partitions
- X_train, X_test = self.X[train_indices], self.X[test_indices]
- y_train, y_test = self.y[train_indices], self.y[test_indices]
-
- # balance the fold
- if self.balance is True:
- X_train, y_train = self.random_oversampling(
- X_train, y_train, random_state=random_state)
- if self.groups is not None:
- groups_train = self.groups[train_indices]
- groups_train, _ = self.random_oversampling(
- groups_train, self.y[train_indices],
- random_state=random_state)
-
- else:
- # also get indices of groups for the training partition
- if self.groups is not None:
- groups_train = self.groups[train_indices]
-
- # fit the model on the training data and predict the test data
- # need the groups parameter because the estimator can be a
- # RandomizedSearchCV estimator where cv=GroupKFold
- if isinstance(self.estimator, RandomizedSearchCV) is True \
- or isinstance(self.estimator, GridSearchCV):
- param_search = True
- else:
- param_search = False
-
- if self.groups is not None and param_search is True:
- fit_train.fit(X_train, y_train, groups=groups_train)
- else:
- fit_train.fit(X_train, y_train)
-
- y_pred = fit_train.predict(X_test)
-
- y_test_agg = np.append(y_test_agg, y_test)
- y_pred_agg = np.append(y_pred_agg, y_pred)
-
- labels = np.unique(y_pred)
-
- # calculate metrics
- if scorers == 'binary':
- self.scores['accuracy'] = np.append(
- self.scores['accuracy'],
- metrics.accuracy_score(y_test, y_pred))
-
- y_pred_proba = fit_train.predict_proba(X_test)[:, 1]
- self.scores['auc'] = np.append(
- self.scores['auc'],
- metrics.roc_auc_score(y_test, y_pred_proba))
-
- self.scores['precision'] = np.append(
- self.scores['precision'], metrics.precision_score(
- y_test, y_pred, labels, average='binary'))
-
- self.scores['recall'] = np.append(
- self.scores['recall'], metrics.recall_score(
- y_test, y_pred, labels, average='binary'))
-
- self.scores['specificity'] = np.append(
- self.scores['specificity'], self.specificity_score(
- y_test, y_pred))
-
- self.scores['f1'] = np.append(
- self.scores['f1'], metrics.f1_score(
- y_test, y_pred, labels, average='binary'))
-
- self.scores['kappa'] = np.append(
- self.scores['kappa'],
- metrics.cohen_kappa_score(y_test, y_pred))
-
- self.scores_cm = metrics.classification_report(
- y_test_agg, y_pred_agg)
-
- elif scorers == 'multiclass':
-
- self.scores['accuracy'] = np.append(
- self.scores['accuracy'],
- metrics.accuracy_score(y_test, y_pred))
-
- self.scores['kappa'] = np.append(
- self.scores['kappa'],
- metrics.cohen_kappa_score(y_test, y_pred))
-
- self.scores_cm = metrics.classification_report(
- y_test_agg, y_pred_agg)
-
- elif scorers == 'regression':
- self.scores['r2'] = np.append(
- self.scores['r2'], metrics.r2_score(y_test, y_pred))
-
- # feature importances using permutation
- if feature_importances is True:
- if bool((self.fimp == 0).all()) is True:
- self.fimp = self.varImp_permutation(
- fit_train, X_test, y_test, n_permutations, scorers,
- random_state)
- else:
- self.fimp = np.row_stack(
- (self.fimp, self.varImp_permutation(
- fit_train, X_test, y_test,
- n_permutations, scorers, random_state)))
-
- # convert onehot-encoded feature importances back to original vars
- if self.fimp is not None and self.enc is not None:
-
- # get start,end positions of each suite of onehot-encoded vars
- feature_ranges = deepcopy(self.enc.feature_indices_)
- for i in range(0, len(self.enc.feature_indices_)-1):
- feature_ranges[i+1] =\
- feature_ranges[i] + len(self.category_values[i])
-
- # take sum of each onehot-encoded feature
- ohe_feature = [0] * len(self.categorical_var)
- ohe_sum = [0] * len(self.categorical_var)
-
- for i in range(len(self.categorical_var)):
- ohe_feature[i] = \
- self.fimp[:, feature_ranges[i]:feature_ranges[i+1]]
- ohe_sum[i] = ohe_feature[i].sum(axis=1)
-
- # remove onehot-encoded features from the importances array
- features_for_removal = np.array(range(feature_ranges[-1]))
- self.fimp = np.delete(self.fimp, features_for_removal, axis=1)
-
- # insert summed importances into original positions
- for index in self.categorical_var:
- self.fimp = np.insert(
- self.fimp, np.array(index), ohe_sum[0], axis=1)
-
- def predict(self, predictors, output, class_probabilities=False,
- rowincr=25):
-
- """
- Prediction on list of GRASS rasters using a fitted scikit learn model
-
- Parameters
- ----------
- predictors: List of GRASS rasters
-
- class_probabilties: Predict class probabilities
-
- rowincr: Integer of raster rows to process at one time
- output: Name of GRASS raster to output classification results
- """
-
- # determine output data type and nodata
- predicted = self.estimator.predict(self.X)
-
- if bool((predicted % 1 == 0).all()) is True:
- ftype = 'CELL'
- nodata = -2147483648
- else:
- ftype = 'FCELL'
- nodata = np.nan
-
- # create a list of rasterrow objects for predictors
- n_features = len(predictors)
- rasstack = [0] * n_features
-
- for i in range(n_features):
- rasstack[i] = RasterRow(predictors[i])
- if rasstack[i].exist() is True:
- rasstack[i].open('r')
- else:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- current = Region()
-
- # create a imagery mask
- # the input rasters might have different dimensions and null pixels.
- # r.series used to automatically create a mask by propagating the nulls
- grass.run_command("r.series", output='tmp_clfmask',
- input=predictors, method='count', flags='n',
- overwrite=True)
-
- mask_raster = RasterRow('tmp_clfmask')
- mask_raster.open('r')
-
- # create and open RasterRow objects for classification
- classification = RasterRow(output)
- classification.open('w', ftype, overwrite=True)
-
- # create and open RasterRow objects for probabilities if enabled
- if class_probabilities is True:
-
- # determine number of classes
- labels = np.unique(self.y)
- nclasses = len(labels)
-
- prob_out_raster = [0] * nclasses
- prob = [0] * nclasses
-
- for iclass in range(nclasses):
- prob_out_raster[iclass] = output + \
- '_classPr' + str(int(labels[iclass]))
- prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite=True)
-
- """
- Prediction using row blocks
- """
-
- for rowblock in range(0, current.rows, rowincr):
-
- # check that the row increment does not exceed the number of rows
- if rowblock+rowincr > current.rows:
- rowincr = current.rows - rowblock
- img_np_row = np.zeros((rowincr, current.cols, n_features))
- mask_np_row = np.zeros((rowincr, current.cols))
-
- # loop through each row, and each band
- # and add these values to the 2D array img_np_row
- for row in range(rowblock, rowblock+rowincr, 1):
- mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
-
- for band in range(n_features):
- img_np_row[row-rowblock, :, band] = \
- np.array(rasstack[band][row])
-
- mask_np_row[mask_np_row == -2147483648] = np.nan
- nanmask = np.isnan(mask_np_row) # True in mask means invalid data
-
- # reshape each row-band matrix into a n*m array
- nsamples = rowincr * current.cols
- flat_pixels = img_np_row.reshape((nsamples, n_features))
-
- # remove NaN values and GRASS CELL nodata vals
- flat_pixels[flat_pixels == -2147483648] = np.nan
- flat_pixels = np.nan_to_num(flat_pixels)
-
- # onehot-encoding
- if self.enc is not None:
- try:
- flat_pixels = self.enc.transform(flat_pixels)
- except:
- # if this fails it is because the onehot-encoder was fitted
- # on the training samples, but the prediction data contains
- # new values, i.e. the training data has not sampled all of
- # categories
- grass.fatal('There are values in the categorical rasters ',
- 'that are not present in the training data ',
- 'set, i.e. the training data has not sampled ',
- 'all of the categories')
-
- # rescale
- if self.scaler is not None:
- # create mask so that indices that represent categorical
- # predictors are not selected
- if self.categorical_var is not None:
- idx = np.arange(self.X.shape[1])
- mask = np.ones(len(idx), dtype=bool)
- mask[self.categorical_var] = False
- else:
- mask = np.arange(self.X.shape[1])
- flat_pixels_continuous = flat_pixels[:, mask]
- flat_pixels[:, mask] = self.scaler.transform(
- flat_pixels_continuous)
-
- # perform prediction
- result = self.estimator.predict(flat_pixels)
- result = result.reshape((rowincr, current.cols))
-
- # replace NaN values so that the prediction does not have a border
- result = np.ma.masked_array(
- result, mask=nanmask, fill_value=-99999)
-
- # return a copy of result, with masked values filled with a value
- result = result.filled([nodata])
-
- # for each row we can perform computation, and write the result
- for row in range(rowincr):
- newrow = Buffer((result.shape[1],), mtype=ftype)
- newrow[:] = result[row, :]
- classification.put_row(newrow)
-
- # same for probabilities
- if class_probabilities is True:
- result_proba = self.estimator.predict_proba(flat_pixels)
-
- for iclass in range(result_proba.shape[1]):
-
- result_proba_class = result_proba[:, iclass]
- result_proba_class = result_proba_class.reshape(
- (rowincr, current.cols))
-
- result_proba_class = np.ma.masked_array(
- result_proba_class, mask=nanmask, fill_value=np.nan)
-
- result_proba_class = result_proba_class.filled([np.nan])
-
- for row in range(rowincr):
-
- newrow = Buffer((
- result_proba_class.shape[1],),
- mtype='FCELL')
-
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
-
- # close all maps
- for i in range(n_features):
- rasstack[i].close()
-
- classification.close()
- mask_raster.close()
-
- try:
- for iclass in range(nclasses):
- prob[iclass].close()
- except:
- pass
-
-
def cleanup():
grass.run_command("g.remove", name='tmp_clfmask',
flags="f", type="raster", quiet=True)
@@ -997,348 +318,6 @@
flags="f", type="raster", quiet=True)
-def model_classifiers(estimator='LogisticRegression', random_state=None,
- C=1, max_depth=None, max_features='auto',
- min_samples_split=2, min_samples_leaf=1,
- n_estimators=100, subsample=1.0,
- learning_rate=0.1, max_degree=1):
-
- """
- Provides the classifiers and parameters using by the module
-
- Args
- ----
- estimator: Name of estimator
- random_state: Seed to use in randomized components
- C: Inverse of regularization strength
- max_depth: Maximum depth for tree-based methods
- min_samples_split: Minimum number of samples to split a node
- min_samples_leaf: Minimum number of samples to form a leaf
- n_estimators: Number of trees
- subsample: Controls randomization in gradient boosting
- learning_rate: Used in gradient boosting
- max_degree: For earth
-
- Returns
- -------
- clf: Scikit-learn classifier object
- mode: Flag to indicate whether classifier performs classification or
- regression
- """
-
- from sklearn.linear_model import LogisticRegression
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
- from sklearn.naive_bayes import GaussianNB
- from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
- from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
- from sklearn.ensemble import GradientBoostingClassifier
- from sklearn.ensemble import GradientBoostingRegressor
- from sklearn.svm import SVC
-
- # optional packages that add additional classifiers here
- if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
- try:
- from sklearn.pipeline import Pipeline
- from pyearth import Earth
-
- earth_classifier = Pipeline([('Earth',
- Earth(max_degree=max_degree)),
- ('Logistic', LogisticRegression())])
-
- classifiers = {'EarthClassifier': earth_classifier,
- 'EarthRegressor': Earth(max_degree=max_degree)}
- except:
- grass.fatal('Py-earth package not installed')
-
- elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
- try:
- from xgboost import XGBClassifier, XGBRegressor
-
- if max_depth is None:
- max_depth = int(3)
-
- classifiers = {
- 'XGBClassifier':
- XGBClassifier(learning_rate=learning_rate,
- n_estimators=n_estimators,
- max_depth=max_depth,
- subsample=subsample),
- 'XGBRegressor':
- XGBRegressor(learning_rate=learning_rate,
- n_estimators=n_estimators,
- max_depth=max_depth,
- subsample=subsample)}
- except:
- grass.fatal('Py-earth package not installed')
- else:
- # core sklearn classifiers go here
- classifiers = {
- 'SVC': SVC(C=C, probability=True, random_state=random_state),
- 'LogisticRegression':
- LogisticRegression(C=C, random_state=random_state, n_jobs=-1,
- fit_intercept=True),
- 'DecisionTreeClassifier':
- DecisionTreeClassifier(max_depth=max_depth,
- max_features=max_features,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- random_state=random_state),
- 'DecisionTreeRegressor':
- DecisionTreeRegressor(max_features=max_features,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- random_state=random_state),
- 'RandomForestClassifier':
- RandomForestClassifier(n_estimators=n_estimators,
- max_features=max_features,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- random_state=random_state,
- n_jobs=-1,
- oob_score=False),
- 'RandomForestRegressor':
- RandomForestRegressor(n_estimators=n_estimators,
- max_features=max_features,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- random_state=random_state,
- n_jobs=-1,
- oob_score=False),
- 'GradientBoostingClassifier':
- GradientBoostingClassifier(learning_rate=learning_rate,
- n_estimators=n_estimators,
- max_depth=max_depth,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- subsample=subsample,
- max_features=max_features,
- random_state=random_state),
- 'GradientBoostingRegressor':
- GradientBoostingRegressor(learning_rate=learning_rate,
- n_estimators=n_estimators,
- max_depth=max_depth,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- subsample=subsample,
- max_features=max_features,
- random_state=random_state),
- 'GaussianNB': GaussianNB(),
- 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
- 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
- }
-
- # define classifier
- clf = classifiers[estimator]
-
- # classification or regression
- if estimator == 'LogisticRegression' \
- or estimator == 'DecisionTreeClassifier' \
- or estimator == 'RandomForestClassifier' \
- or estimator == 'GradientBoostingClassifier' \
- or estimator == 'GaussianNB' \
- or estimator == 'LinearDiscriminantAnalysis' \
- or estimator == 'QuadraticDiscriminantAnalysis' \
- or estimator == 'EarthClassifier' \
- or estimator == 'XGBClassifier' \
- or estimator == 'SVC':
- mode = 'classification'
- else:
- mode = 'regression'
-
- return (clf, mode)
-
-
-def save_training_data(X, y, groups, file):
-
- """
- Saves any extracted training data to a csv file
-
- Args
- ----
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels
- file: Path to a csv file to save data to
- """
-
- # if there are no group labels, create a nan filled array
- if groups is None:
- groups = np.empty((y.shape[0]))
- groups[:] = np.nan
-
- training_data = np.column_stack([X, y, groups])
- np.savetxt(file, training_data, delimiter=',')
-
-
-def load_training_data(file):
-
- """
- Loads training data and labels from a csv file
-
- Args
- ----
- file: Path to a csv file to save data to
-
- Returns
- -------
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels, or None
- """
-
- training_data = np.loadtxt(file, delimiter=',')
- n_cols = training_data.shape[1]
- last_Xcol = n_cols-2
-
- # check to see if last column contains group labels or nans
- groups = training_data[:, -1]
-
- # if all nans then set groups to None
- if bool(np.isnan(groups).all()) is True:
- groups = None
-
- # fetch X and y
- X = training_data[:, 0:last_Xcol]
- y = training_data[:, -2]
-
- return(X, y, groups)
-
-
-def sample_predictors(response, predictors, impute=False, shuffle_data=True,
- lowmem=False, random_state=None):
-
- from sklearn.utils import shuffle
- from sklearn.preprocessing import Imputer
-
- """
- Samples a list of GRASS rasters using a labelled raster
- Per raster sampling
-
- Args
- ----
- response: String; GRASS raster with labelled pixels
- predictors: List of GRASS rasters containing explanatory variables
-
- Returns
- -------
-
- training_data: Numpy array of extracted raster values
- training_labels: Numpy array of labels
- y_indexes: Row and Columns of label positions
- """
-
- current = Region()
-
- # open response raster as rasterrow and read as np array
- if RasterRow(response).exist() is True:
- roi_gr = RasterRow(response)
- roi_gr.open('r')
-
- if lowmem is False:
- response_np = np.array(roi_gr)
- else:
- response_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
- response_np[:] = np.array(roi_gr)[:]
- else:
- grass.fatal("GRASS response raster does not exist.... exiting")
-
- # determine number of predictor rasters
- n_features = len(predictors)
-
- # check to see if all predictors exist
- for i in range(n_features):
- if RasterRow(predictors[i]).exist() is not True:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # check if any of those pixels are labelled (not equal to nodata)
- # can use even if roi is FCELL because nodata will be nan
- is_train = np.nonzero(response_np > -2147483648)
- training_labels = response_np[is_train]
- n_labels = np.array(is_train).shape[1]
-
- # Create a zero numpy array of len training labels
- if lowmem is False:
- training_data = np.zeros((n_labels, n_features))
- else:
- training_data = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(n_labels, n_features))
-
- # Loop through each raster and sample pixel values at training indexes
- if lowmem is True:
- feature_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
-
- for f in range(n_features):
- predictor_gr = RasterRow(predictors[f])
- predictor_gr.open('r')
-
- if lowmem is False:
- feature_np = np.array(predictor_gr)
- else:
- feature_np[:] = np.array(predictor_gr)[:]
-
- training_data[0:n_labels, f] = feature_np[is_train]
-
- # close each predictor map
- predictor_gr.close()
-
- # convert any CELL maps no datavals to NaN in the training data
- for i in range(n_features):
- training_data[training_data[:, i] == -2147483648] = np.nan
-
- # convert indexes of training pixels from tuple to n*2 np array
- is_train = np.array(is_train).T
-
- # impute missing values
- if impute is True:
- missing = Imputer(strategy='median')
- training_data = missing.fit_transform(training_data)
-
- # Remove nan rows from training data
- X = training_data[~np.isnan(training_data).any(axis=1)]
- y = training_labels[~np.isnan(training_data).any(axis=1)]
- y_indexes = is_train[~np.isnan(training_data).any(axis=1)]
-
- # shuffle the training data
- if shuffle_data is True:
- X, y, y_indexes = shuffle(X, y, y_indexes, random_state=random_state)
-
- # close the response map
- roi_gr.close()
-
- return(X, y, y_indexes)
-
-
-def maps_from_group(group):
- """
- Parse individual rasters into a list from an imagery group
-
- Args
- ----
- group: String; GRASS imagery group
- Returns
- -------
- maplist: Python list containing individual GRASS raster maps
- """
- groupmaps = im.group(group=group, flags="g",
- quiet=True, stdout_=PIPE).outputs.stdout
-
- maplist = groupmaps.split(os.linesep)
- maplist = maplist[0:len(maplist)-1]
- map_names = []
-
- for rastername in maplist:
- map_names.append(rastername.split('@')[0])
-
- return(maplist, map_names)
-
-
def main():
try:
@@ -1501,7 +480,7 @@
if group_raster != '':
maplist2 = deepcopy(maplist)
maplist2.append(group_raster)
- X, y, sample_coords = sample_predictors(
+ X, y, sample_coords = extract(
response=response, predictors=maplist2,
impute=impute, shuffle_data=False, lowmem=False,
random_state=random_state)
@@ -1521,7 +500,7 @@
else:
# extract training data from maplist without group Ids
# shuffle this data by default
- X, y, sample_coords = sample_predictors(
+ X, y, sample_coords = extract(
response=response, predictors=maplist,
impute=impute,
shuffle_data=True,
@@ -1540,8 +519,8 @@
# check for labelled pixels and training data
if y.shape[0] == 0 or X.shape[0] == 0:
- grass.fatal(('No training pixels or pixels in imagery group '
- '...check computational region'))
+ grass.fatal('No training pixels or pixels in imagery group '
+ '...check computational region')
# option to save extracted data to .csv file
if save_training != '':
@@ -1590,7 +569,7 @@
if mode == 'regression' and probability is True:
grass.warning(
- 'Class probabilities only valid for classifications...',
+ 'Class probabilities only valid for classifications...'
'ignoring')
probability = False
@@ -1681,8 +660,8 @@
errors = pd.DataFrame(learn_m.scores)
errors.to_csv(errors_file, mode='w')
except:
- grass.warning('Pandas is not installed. Pandas is ',
- 'required to write the cross-validation ',
+ grass.warning('Pandas is not installed. Pandas is '
+ 'required to write the cross-validation '
'results to file')
# feature importances
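
Several of the hunks above fix message strings that were split with commas: in Python a comma between string literals builds a tuple (or passes extra positional arguments inside a call), whereas adjacent literals with no comma are concatenated by the parser into a single string. A standalone illustration of the difference:

    # comma-separated literals: a tuple of three strings, not one message
    msg_tuple = ('Pandas is not installed. Pandas is ',
                 'required to write the cross-validation ',
                 'results to file')

    # adjacent literals with no commas: one concatenated string
    msg_str = ('Pandas is not installed. Pandas is '
               'required to write the cross-validation '
               'results to file')

    assert isinstance(msg_tuple, tuple)
    assert isinstance(msg_str, str)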
Added: grass-addons/grass7/raster/r.learn.ml/raster_learning.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/raster_learning.py (rev 0)
+++ grass-addons/grass7/raster/r.learn.ml/raster_learning.py 2017-02-07 19:55:18 UTC (rev 70499)
@@ -0,0 +1,1038 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 7 09:03:10 2017
+
+@author: steve
+"""
+
+import os
+import numpy as np
+from copy import deepcopy
+import tempfile
+import grass.script as grass
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.modules.shortcuts import imagery as im
+from subprocess import PIPE
+
+
+class train():
+
+ def __init__(self, estimator, X, y, groups=None, categorical_var=None,
+ standardize=False, balance=False):
+ """
+ Train class to perform preprocessing, fitting, parameter search and
+ cross-validation in a single step
+
+ Args
+ ----
+ estimator: Scikit-learn compatible estimator object
+ X, y: training data and labels as numpy arrays
+ groups: groups to be used for cross-validation
+ categorical_var: 1D list containing indices of categorical predictors
+ standardize: Boolean to standardize the continuous predictors
+ balance: Boolean to balance the number of class observations by oversampling
+ """
+
+ # fitting data
+ self.estimator = estimator
+ self.X = X
+ self.y = y
+ self.groups = groups
+ self.balance = balance
+
+ # for onehot-encoding
+ self.enc = None
+ self.categorical_var = categorical_var
+ self.category_values = None
+
+ if self.categorical_var:
+ self.__onehotencode()
+
+ # for standardization
+ if standardize is True:
+ self.standardization()
+ else:
+ self.scaler = None
+
+ # for cross-validation scores
+ self.scores = None
+ self.scores_cm = None
+ self.fimp = None
+
+ def __random_oversampling(self, X, y, random_state=None):
+ """
+ Balances X, y observations using simple oversampling
+
+ Args
+ ----
+ X: numpy array of training data
+ y: 1D numpy array of response data
+ random_state: Seed to pass onto random number generator
+
+ Returns
+ -------
+ X_resampled: Numpy array of resampled training data
+ y_resampled: Numpy array of resampled response data
+ """
+
+ np.random.seed(seed=random_state)
+
+ # count the number of observations per class
+ y_classes = np.unique(y)
+ class_counts = np.histogram(y, bins=len(y_classes))[0]
+ maj_counts = class_counts.max()
+
+ y_resampled = y
+ X_resampled = X
+
+ for cla, counts in zip(y_classes, class_counts):
+ # get the number of samples needed to balance minority class
+ num_samples = maj_counts - counts
+
+ # get the indices of the ith class
+ indx = np.nonzero(y == cla)
+
+ # create some new indices
+ oversamp_indx = np.random.choice(indx[0], size=num_samples)
+
+ # concatenate to the original X and y
+ y_resampled = np.concatenate((y[oversamp_indx], y_resampled))
+ X_resampled = np.concatenate((X[oversamp_indx], X_resampled))
+
+ return (X_resampled, y_resampled)
+
+ def __onehotencode(self):
+ """
+ Method to convert a list of categorical arrays in X into a suite of
+ binary predictors which are added to the left of the array
+ """
+
+ from sklearn.preprocessing import OneHotEncoder
+
+ # store original range of values
+ self.category_values = [0] * len(self.categorical_var)
+ for i, cat in enumerate(self.categorical_var):
+ self.category_values[i] = np.unique(self.X[:, cat])
+
+ # fit and transform categorical grids to a suite of binary features
+ self.enc = OneHotEncoder(categorical_features=self.categorical_var,
+ sparse=False)
+ self.enc.fit(self.X)
+ self.X = self.enc.transform(self.X)
+
+ def fit(self, param_distributions=None, param_grid=None,
+ scoring=None, n_iter=3, cv=3, random_state=None):
+
+ """
+ Main fit method for the train object. Performs fitting, hyperparameter
+ search and cross-validation in one step (inspired from R's CARET)
+
+ Args
+ ----
+ param_distributions: continuous parameter distribution to be used in a
+ randomizedCVsearch
+ param_grid: Dist of non-continuous parameters to grid search
+ n_iter: Number of randomized search iterations
+ cv: Number of cross-validation folds for parameter tuning
+ random_state: seed to be used during random number generation
+ """
+
+ from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
+ from sklearn.model_selection import GroupKFold
+
+ # Balance classes
+ if self.balance is True:
+ X, y = self.__random_oversampling(
+ self.X, self.y, random_state=random_state)
+
+ if self.groups is not None:
+ groups, _ = self.__random_oversampling(
+ self.groups, self.y, random_state=random_state)
+ else:
+ groups = None
+ else:
+ X = self.X
+ y = self.y
+ groups = self.groups
+
+ # Randomized or grid search
+ if param_distributions is not None or param_grid is not None:
+
+ # use groupkfold for hyperparameter search if groups are present
+ if self.groups is not None:
+ cv_search = GroupKFold(n_splits=cv)
+ else:
+ cv_search = cv
+
+ # Randomized search
+ if param_distributions is not None:
+ self.estimator = RandomizedSearchCV(
+ estimator=self.estimator,
+ param_distributions=param_distributions,
+ n_iter=n_iter, scoring=scoring,
+ cv=cv_search)
+
+ # Grid Search
+ if param_grid is not None:
+ self.estimator = GridSearchCV(self.estimator,
+ param_grid,
+ n_jobs=-1, cv=cv_search,
+ scoring=scoring)
+
+ # if groups then fit RandomizedSearchCV.fit requires groups param
+ if self.groups is None:
+ self.estimator.fit(X, y)
+ else:
+ self.estimator.fit(X, y, groups=groups)
+
+ # Fitting without parameter search
+ else:
+ self.estimator.fit(X, y)
+
+ def standardization(self):
+ """
+ Transforms the non-categorical X
+ """
+
+ from sklearn.preprocessing import StandardScaler
+
+ # create mask so that indices that represent categorical
+ # predictors are not selected
+ if self.categorical_var is not None:
+ idx = np.arange(self.X.shape[1])
+ mask = np.ones(len(idx), dtype=bool)
+ mask[self.categorical_var] = False
+ else:
+ mask = np.arange(self.X.shape[1])
+
+ X_continuous = self.X[:, mask]
+ self.scaler = StandardScaler()
+ self.scaler.fit(X_continuous)
+ self.X[:, mask] = self.scaler.transform(X_continuous)
+
+ def __pred_func(self, estimator, X_test, y_true, scorers):
+ """
+ Calculates a single performance metric depending on whether the scorer
+ type is binary, multiclass or regression
+
+ To be called from __varImp_permutation
+
+ Args
+ ----
+ estimator: fitted estimator on training set
+ X_test: Data from a test partition
+ y_true: Labels from a test partition
+ scorers: String indicating which type of scorer to be used
+ """
+
+ from sklearn import metrics
+
+ if scorers == 'binary':
+ scorer = metrics.roc_auc_score
+ y_pred = estimator.predict_proba(X_test)[:, 1]
+ if scorers == 'multiclass':
+ scorer = metrics.accuracy_score
+ y_pred = estimator.predict(X_test)
+ if scorers == 'regression':
+ scorer = metrics.r2_score
+ y_pred = estimator.predict(X_test)
+
+ score = scorer(y_true, y_pred)
+
+ return (score)
+
+ def __varImp_permutation(self, estimator, X_test, y_true,
+ n_permutations, scorers,
+ random_state):
+
+ """
+ Method to perform permutation-based feature importance during
+ cross-validation (cross-validation is applied externally to this
+ method)
+
+ Procedure is:
+ 1. Pass fitted estimator and test partition X y
+ 2. Assess AUC on the test partition (bestauc)
+ 3. Permute each variable and assess the difference between bestauc and
+ the score with that variable permuted
+ 4. Repeat (3) for many random permutations
+ 5. Average the repeats
+
+ Args
+ ----
+ estimator: estimator that has been fitted to a training partition
+ X_test, y_true: data and labels from a test partition
+ n_permutations: number of random permutations to apply
+ random_state: seed to pass to the numpy random.seed
+
+ Returns
+ -------
+ scores: Mean decrease in score for each predictor following permutation
+ """
+
+ # calculate score on original variables without permutation
+ # determine best metric type for binary/multiclass/regression scenarios
+ best_score = self.__pred_func(estimator, X_test, y_true, scorers)
+
+ np.random.seed(seed=random_state)
+ scores = np.zeros((n_permutations, X_test.shape[1]))
+
+ # outer loop to repeat the permutation n_permutations times
+ for rep in range(n_permutations):
+
+ # inner loop to permute each predictor variable and assess
+ # the difference in score
+ for i in range(X_test.shape[1]):
+ Xscram = np.copy(X_test)
+ Xscram[:, i] = np.random.choice(X_test[:, i], X_test.shape[0])
+
+ # record the decrease in score after permuting predictor i
+ scores[rep, i] = best_score-self.__pred_func(
+ estimator, Xscram, y_true, scorers)
+ if scores[rep, i] < 0:
+ scores[rep, i] = 0
+
+ # average the repetitions
+ scores = scores.mean(axis=0)
+
+ return(scores)
+
+ def specificity_score(self, y_true, y_pred):
+
+ from sklearn.metrics import confusion_matrix
+
+ cm = confusion_matrix(y_true, y_pred)
+
+ tn = float(cm[0][0])
+ # fn = float(cm[1][0])
+ # tp = float(cm[1][1])
+ fp = float(cm[0][1])
+
+ specificity = tn/(tn+fp)
+
+ return (specificity)
+
+ def cross_val(self, scorers='binary', cv=3, feature_importances=False,
+ n_permutations=25, random_state=None):
+
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.model_selection import GroupKFold
+ from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
+ from sklearn import metrics
+
+ """
+ Stratified KFold and GroupKFold cross-validation
+ Generates suites of scoring metrics for binary classification,
+ multiclass classification and regression scenarios
+
+ Args
+ ----
+ scorers: Suite of performance metrics to use
+ cv: Integer of cross-validation folds
+ feature_importances: Boolean to perform permutation-based importances
+ n_permutations: Number of permutations during feature importance
+ random_state: Seed to pass to the random number generator
+ """
+
+ # create copy of fitting estimator for cross-val fitting
+ fit_train = deepcopy(self.estimator)
+
+ # dictionary of lists to store metrics
+ if scorers == 'binary':
+ self.scores = {
+ 'accuracy': [],
+ 'precision': [],
+ 'recall': [],
+ 'specificity': [],
+ 'f1': [],
+ 'kappa': [],
+ 'auc': []
+ }
+
+ if scorers == 'multiclass':
+ self.scores = {
+ 'accuracy': [],
+ 'f1': [],
+ 'kappa': []
+ }
+
+ if scorers == 'regression':
+ self.scores = {
+ 'r2': []
+ }
+
+ y_test_agg = []
+ y_pred_agg = []
+ self.fimp = np.zeros((cv, self.X.shape[1]))
+
+ # generate Kfold indices
+ if self.groups is None:
+ k_fold = StratifiedKFold(
+ n_splits=cv,
+ shuffle=False,
+ random_state=random_state).split(self.X, self.y)
+ else:
+ k_fold = GroupKFold(n_splits=cv).split(
+ self.X, self.y, groups=self.groups)
+
+ for train_indices, test_indices in k_fold:
+
+ # get indices for train and test partitions
+ X_train, X_test = self.X[train_indices], self.X[test_indices]
+ y_train, y_test = self.y[train_indices], self.y[test_indices]
+
+ # balance the fold
+ if self.balance is True:
+ X_train, y_train = self.__random_oversampling(
+ X_train, y_train, random_state=random_state)
+ if self.groups is not None:
+ groups_train = self.groups[train_indices]
+ groups_train, _ = self.__random_oversampling(
+ groups_train, self.y[train_indices],
+ random_state=random_state)
+
+ else:
+ # also get indices of groups for the training partition
+ if self.groups is not None:
+ groups_train = self.groups[train_indices]
+
+ # fit the model on the training data and predict the test data
+ # need the groups parameter because the estimator can be a
+ # RandomizedSearchCV estimator where cv=GroupKFold
+ if isinstance(self.estimator, RandomizedSearchCV) is True \
+ or isinstance(self.estimator, GridSearchCV):
+ param_search = True
+ else:
+ param_search = False
+
+ if self.groups is not None and param_search is True:
+ fit_train.fit(X_train, y_train, groups=groups_train)
+ else:
+ fit_train.fit(X_train, y_train)
+
+ y_pred = fit_train.predict(X_test)
+
+ y_test_agg = np.append(y_test_agg, y_test)
+ y_pred_agg = np.append(y_pred_agg, y_pred)
+
+ labels = np.unique(y_pred)
+
+ # calculate metrics
+ if scorers == 'binary':
+ self.scores['accuracy'] = np.append(
+ self.scores['accuracy'],
+ metrics.accuracy_score(y_test, y_pred))
+
+ y_pred_proba = fit_train.predict_proba(X_test)[:, 1]
+ self.scores['auc'] = np.append(
+ self.scores['auc'],
+ metrics.roc_auc_score(y_test, y_pred_proba))
+
+ self.scores['precision'] = np.append(
+ self.scores['precision'], metrics.precision_score(
+ y_test, y_pred, labels, average='binary'))
+
+ self.scores['recall'] = np.append(
+ self.scores['recall'], metrics.recall_score(
+ y_test, y_pred, labels, average='binary'))
+
+ self.scores['specificity'] = np.append(
+ self.scores['specificity'], self.specificity_score(
+ y_test, y_pred))
+
+ self.scores['f1'] = np.append(
+ self.scores['f1'], metrics.f1_score(
+ y_test, y_pred, labels, average='binary'))
+
+ self.scores['kappa'] = np.append(
+ self.scores['kappa'],
+ metrics.cohen_kappa_score(y_test, y_pred))
+
+ self.scores_cm = metrics.classification_report(
+ y_test_agg, y_pred_agg)
+
+ elif scorers == 'multiclass':
+
+ self.scores['accuracy'] = np.append(
+ self.scores['accuracy'],
+ metrics.accuracy_score(y_test, y_pred))
+
+ self.scores['kappa'] = np.append(
+ self.scores['kappa'],
+ metrics.cohen_kappa_score(y_test, y_pred))
+
+ self.scores_cm = metrics.classification_report(
+ y_test_agg, y_pred_agg)
+
+ elif scorers == 'regression':
+ self.scores['r2'] = np.append(
+ self.scores['r2'], metrics.r2_score(y_test, y_pred))
+
+ # feature importances using permutation
+ if feature_importances is True:
+ if bool((self.fimp == 0).all()) is True:
+ self.fimp = self.__varImp_permutation(
+ fit_train, X_test, y_test, n_permutations, scorers,
+ random_state)
+ else:
+ self.fimp = np.row_stack(
+ (self.fimp, self.__varImp_permutation(
+ fit_train, X_test, y_test,
+ n_permutations, scorers, random_state)))
+
+ # convert onehot-encoded feature importances back to original vars
+ if self.fimp is not None and self.enc is not None:
+
+ # get start,end positions of each suite of onehot-encoded vars
+ feature_ranges = deepcopy(self.enc.feature_indices_)
+ for i in range(0, len(self.enc.feature_indices_)-1):
+ feature_ranges[i+1] =\
+ feature_ranges[i] + len(self.category_values[i])
+
+ # take sum of each onehot-encoded feature
+ ohe_feature = [0] * len(self.categorical_var)
+ ohe_sum = [0] * len(self.categorical_var)
+
+ for i in range(len(self.categorical_var)):
+ ohe_feature[i] = \
+ self.fimp[:, feature_ranges[i]:feature_ranges[i+1]]
+ ohe_sum[i] = ohe_feature[i].sum(axis=1)
+
+ # remove onehot-encoded features from the importances array
+ features_for_removal = np.array(range(feature_ranges[-1]))
+ self.fimp = np.delete(self.fimp, features_for_removal, axis=1)
+
+ # insert summed importances into original positions
+ for i, index in enumerate(self.categorical_var):
+ self.fimp = np.insert(
+ self.fimp, np.array(index), ohe_sum[i], axis=1)
+
+ def predict(self, predictors, output, class_probabilities=False,
+ rowincr=25):
+
+ """
+ Prediction on list of GRASS rasters using a fitted scikit learn model
+
+ Parameters
+ ----------
+ predictors: List of GRASS rasters
+
+ class_probabilities: Predict class probabilities
+
+ rowincr: Integer of raster rows to process at one time
+ output: Name of GRASS raster to output classification results
+ """
+
+ # determine output data type and nodata
+ predicted = self.estimator.predict(self.X)
+
+ if bool((predicted % 1 == 0).all()) is True:
+ ftype = 'CELL'
+ nodata = -2147483648
+ else:
+ ftype = 'FCELL'
+ nodata = np.nan
+
+ # create a list of rasterrow objects for predictors
+ n_features = len(predictors)
+ rasstack = [0] * n_features
+
+ for i in range(n_features):
+ rasstack[i] = RasterRow(predictors[i])
+ if rasstack[i].exist() is True:
+ rasstack[i].open('r')
+ else:
+ grass.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ current = Region()
+
+ # create an imagery mask
+ # the input rasters might have different dimensions and null pixels.
+ # r.series is used to automatically create a mask by propagating the nulls
+ grass.run_command("r.series", output='tmp_clfmask',
+ input=predictors, method='count', flags='n',
+ overwrite=True)
+
+ mask_raster = RasterRow('tmp_clfmask')
+ mask_raster.open('r')
+
+ # create and open RasterRow objects for classification
+ classification = RasterRow(output)
+ classification.open('w', ftype, overwrite=True)
+
+ # create and open RasterRow objects for probabilities if enabled
+ if class_probabilities is True:
+
+ # determine number of classes
+ labels = np.unique(self.y)
+ nclasses = len(labels)
+
+ prob_out_raster = [0] * nclasses
+ prob = [0] * nclasses
+
+ for iclass in range(nclasses):
+ prob_out_raster[iclass] = output + \
+ '_classPr' + str(int(labels[iclass]))
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ """
+ Prediction using row blocks
+ """
+
+ for rowblock in range(0, current.rows, rowincr):
+
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
+ mask_np_row = np.zeros((rowincr, current.cols))
+
+ # loop through each row, and each band
+ # and add these values to the 2D array img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
+
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
+
+ mask_np_row[mask_np_row == -2147483648] = np.nan
+ nanmask = np.isnan(mask_np_row) # True in mask means invalid data
+
+ # reshape each row-band matrix into a n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaN values and GRASS CELL nodata vals
+ flat_pixels[flat_pixels == -2147483648] = np.nan
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # onehot-encoding
+ if self.enc is not None:
+ try:
+ flat_pixels = self.enc.transform(flat_pixels)
+ except:
+ # if this fails it is because the onehot-encoder was fitted
+ # on the training samples, but the prediction data contains
+ # new values, i.e. the training data has not sampled all of
+ # the categories
+ grass.fatal('There are values in the categorical rasters '
+ 'that are not present in the training data '
+ 'set, i.e. the training data has not sampled '
+ 'all of the categories')
+
+ # rescale
+ if self.scaler is not None:
+ # create mask so that indices that represent categorical
+ # predictors are not selected
+ if self.categorical_var is not None:
+ idx = np.arange(self.X.shape[1])
+ mask = np.ones(len(idx), dtype=bool)
+ mask[self.categorical_var] = False
+ else:
+ mask = np.arange(self.X.shape[1])
+ flat_pixels_continuous = flat_pixels[:, mask]
+ flat_pixels[:, mask] = self.scaler.transform(
+ flat_pixels_continuous)
+
+ # perform prediction
+ result = self.estimator.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
+
+ # replace NaN values so that the prediction does not have a border
+ result = np.ma.masked_array(
+ result, mask=nanmask, fill_value=-99999)
+
+ # return a copy of result, with masked values filled with a value
+ result = result.filled([nodata])
+
+ # for each row we can perform computation, and write the result
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
+
+ # same for probabilities
+ if class_probabilities is True:
+ result_proba = self.estimator.predict_proba(flat_pixels)
+
+ for iclass in range(result_proba.shape[1]):
+
+ result_proba_class = result_proba[:, iclass]
+ result_proba_class = result_proba_class.reshape(
+ (rowincr, current.cols))
+
+ result_proba_class = np.ma.masked_array(
+ result_proba_class, mask=nanmask, fill_value=np.nan)
+
+ result_proba_class = result_proba_class.filled([np.nan])
+
+ for row in range(rowincr):
+
+ newrow = Buffer((
+ result_proba_class.shape[1],),
+ mtype='FCELL')
+
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
+ # close all maps
+ for i in range(n_features):
+ rasstack[i].close()
+
+ classification.close()
+ mask_raster.close()
+
+ try:
+ for iclass in range(nclasses):
+ prob[iclass].close()
+ except:
+ pass
+
+
+def model_classifiers(estimator='LogisticRegression', random_state=None,
+ C=1, max_depth=None, max_features='auto',
+ min_samples_split=2, min_samples_leaf=1,
+ n_estimators=100, subsample=1.0,
+ learning_rate=0.1, max_degree=1):
+
+ """
+ Provides the classifiers and parameters used by the module
+
+ Args
+ ----
+ estimator: Name of estimator
+ random_state: Seed to use in randomized components
+ C: Inverse of regularization strength
+ max_depth: Maximum depth for tree-based methods
+ min_samples_split: Minimum number of samples to split a node
+ min_samples_leaf: Minimum number of samples to form a leaf
+ n_estimators: Number of trees
+ subsample: Controls randomization in gradient boosting
+ learning_rate: Used in gradient boosting
+ max_degree: Maximum degree of interaction terms for the Earth models
+
+ Returns
+ -------
+ clf: Scikit-learn classifier object
+ mode: Flag to indicate whether classifier performs classification or
+ regression
+ """
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.ensemble import GradientBoostingClassifier
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.svm import SVC
+
+ # optional packages that add additional classifiers here
+ if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
+ try:
+ from sklearn.pipeline import Pipeline
+ from pyearth import Earth
+
+ earth_classifier = Pipeline([('Earth',
+ Earth(max_degree=max_degree)),
+ ('Logistic', LogisticRegression())])
+
+ classifiers = {'EarthClassifier': earth_classifier,
+ 'EarthRegressor': Earth(max_degree=max_degree)}
+ except:
+ grass.fatal('Py-earth package not installed')
+
+ elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
+ try:
+ from xgboost import XGBClassifier, XGBRegressor
+
+ if max_depth is None:
+ max_depth = int(3)
+
+ classifiers = {
+ 'XGBClassifier':
+ XGBClassifier(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ subsample=subsample),
+ 'XGBRegressor':
+ XGBRegressor(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ subsample=subsample)}
+ except:
+ grass.fatal('XGBoost package not installed')
+ else:
+ # core sklearn classifiers go here
+ classifiers = {
+ 'SVC': SVC(C=C, probability=True, random_state=random_state),
+ 'LogisticRegression':
+ LogisticRegression(C=C, random_state=random_state, n_jobs=-1,
+ fit_intercept=True),
+ 'DecisionTreeClassifier':
+ DecisionTreeClassifier(max_depth=max_depth,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state),
+ 'DecisionTreeRegressor':
+ DecisionTreeRegressor(max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state),
+ 'RandomForestClassifier':
+ RandomForestClassifier(n_estimators=n_estimators,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state,
+ n_jobs=-1,
+ oob_score=False),
+ 'RandomForestRegressor':
+ RandomForestRegressor(n_estimators=n_estimators,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state,
+ n_jobs=-1,
+ oob_score=False),
+ 'GradientBoostingClassifier':
+ GradientBoostingClassifier(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ subsample=subsample,
+ max_features=max_features,
+ random_state=random_state),
+ 'GradientBoostingRegressor':
+ GradientBoostingRegressor(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ subsample=subsample,
+ max_features=max_features,
+ random_state=random_state),
+ 'GaussianNB': GaussianNB(),
+ 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+ 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+ }
+
+ # define classifier
+ clf = classifiers[estimator]
+
+ # classification or regression
+ if estimator == 'LogisticRegression' \
+ or estimator == 'DecisionTreeClassifier' \
+ or estimator == 'RandomForestClassifier' \
+ or estimator == 'GradientBoostingClassifier' \
+ or estimator == 'GaussianNB' \
+ or estimator == 'LinearDiscriminantAnalysis' \
+ or estimator == 'QuadraticDiscriminantAnalysis' \
+ or estimator == 'EarthClassifier' \
+ or estimator == 'XGBClassifier' \
+ or estimator == 'SVC':
+ mode = 'classification'
+ else:
+ mode = 'regression'
+
+ return (clf, mode)
+
+
+def save_training_data(X, y, groups, file):
+
+ """
+ Saves any extracted training data to a csv file
+
+ Args
+ ----
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels
+ file: Path to a csv file to save data to
+ """
+
+ # if there are no group labels, create a nan filled array
+ if groups is None:
+ groups = np.empty((y.shape[0]))
+ groups[:] = np.nan
+
+ training_data = np.column_stack([X, y, groups])
+ np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+
+ """
+ Loads training data and labels from a csv file
+
+ Args
+ ----
+ file: Path to a csv file to load data from
+
+ Returns
+ -------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels, or None
+ """
+
+ training_data = np.loadtxt(file, delimiter=',')
+ n_cols = training_data.shape[1]
+ last_Xcol = n_cols-2
+
+ # check to see if last column contains group labels or nans
+ groups = training_data[:, -1]
+
+ # if all nans then set groups to None
+ if bool(np.isnan(groups).all()) is True:
+ groups = None
+
+ # fetch X and y
+ X = training_data[:, 0:last_Xcol]
+ y = training_data[:, -2]
+
+ return(X, y, groups)
+
+
+def extract(response, predictors, impute=False, shuffle_data=True,
+ lowmem=False, random_state=None):
+
+ """
+ Samples a list of GRASS rasters using a labelled raster;
+ sampling is performed per raster
+
+ Args
+ ----
+ response: String; GRASS raster with labelled pixels
+ predictors: List of GRASS rasters containing explanatory variables
+
+ Returns
+ -------
+
+ training_data: Numpy array of extracted raster values
+ training_labels: Numpy array of labels
+ y_indexes: Row and Columns of label positions
+ """
+
+ from sklearn.utils import shuffle
+ from sklearn.preprocessing import Imputer
+
+ current = Region()
+
+ # open response raster as rasterrow and read as np array
+ if RasterRow(response).exist() is True:
+ roi_gr = RasterRow(response)
+ roi_gr.open('r')
+
+ if lowmem is False:
+ response_np = np.array(roi_gr)
+ else:
+ response_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+ response_np[:] = np.array(roi_gr)[:]
+ else:
+ grass.fatal("GRASS response raster does not exist.... exiting")
+
+ # determine number of predictor rasters
+ n_features = len(predictors)
+
+ # check to see if all predictors exist
+ for i in range(n_features):
+ if RasterRow(predictors[i]).exist() is not True:
+ grass.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # check if any of those pixels are labelled (not equal to nodata)
+ # can use even if roi is FCELL because nodata will be nan
+ is_train = np.nonzero(response_np > -2147483648)
+ training_labels = response_np[is_train]
+ n_labels = np.array(is_train).shape[1]
+
+ # Create a zero numpy array of len training labels
+ if lowmem is False:
+ training_data = np.zeros((n_labels, n_features))
+ else:
+ training_data = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(n_labels, n_features))
+
+ # Loop through each raster and sample pixel values at training indexes
+ if lowmem is True:
+ feature_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+
+ for f in range(n_features):
+ predictor_gr = RasterRow(predictors[f])
+ predictor_gr.open('r')
+
+ if lowmem is False:
+ feature_np = np.array(predictor_gr)
+ else:
+ feature_np[:] = np.array(predictor_gr)[:]
+
+ training_data[0:n_labels, f] = feature_np[is_train]
+
+ # close each predictor map
+ predictor_gr.close()
+
+ # convert any CELL map nodata values to NaN in the training data
+ for i in range(n_features):
+ training_data[training_data[:, i] == -2147483648] = np.nan
+
+ # convert indexes of training pixels from tuple to n*2 np array
+ is_train = np.array(is_train).T
+
+ # impute missing values
+ if impute is True:
+ missing = Imputer(strategy='median')
+ training_data = missing.fit_transform(training_data)
+
+ # Remove nan rows from training data
+ X = training_data[~np.isnan(training_data).any(axis=1)]
+ y = training_labels[~np.isnan(training_data).any(axis=1)]
+ y_indexes = is_train[~np.isnan(training_data).any(axis=1)]
+
+ # shuffle the training data
+ if shuffle_data is True:
+ X, y, y_indexes = shuffle(X, y, y_indexes, random_state=random_state)
+
+ # close the response map
+ roi_gr.close()
+
+ return(X, y, y_indexes)
+
+
+def maps_from_group(group):
+
+ """
+ Parse individual rasters into a list from an imagery group
+
+ Args
+ ----
+ group: String; GRASS imagery group
+
+ Returns
+ -------
+ maplist: Python list containing individual GRASS raster maps
+ """
+ groupmaps = im.group(group=group, flags="g",
+ quiet=True, stdout_=PIPE).outputs.stdout
+
+ maplist = groupmaps.split(os.linesep)
+ maplist = maplist[0:len(maplist)-1]
+ map_names = []
+
+ for rastername in maplist:
+ map_names.append(rastername.split('@')[0])
+
+ return(maplist, map_names)
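
Taken together, the new module supports an end-to-end workflow inside a pygrass session. A minimal sketch, assuming scikit-learn is installed and that an imagery group 'lsat7' and a labelled raster 'landclass_roi' exist (both names are hypothetical):

    from grass.pygrass.utils import set_path
    set_path('r.learn.ml')
    from raster_learning import (train, model_classifiers,
                                 extract, maps_from_group)

    # resolve the individual rasters in the imagery group
    maplist, map_names = maps_from_group('lsat7')

    # sample the predictors at the labelled pixels
    X, y, sample_coords = extract(response='landclass_roi',
                                  predictors=maplist)

    # build an estimator and wrap it in the train helper
    clf, mode = model_classifiers(estimator='RandomForestClassifier',
                                  random_state=1, n_estimators=500)
    learn_m = train(clf, X, y)

    # fit, cross-validate and predict back onto the rasters
    learn_m.fit()
    learn_m.cross_val(scorers='multiclass', cv=3)
    print(learn_m.scores)
    learn_m.predict(predictors=maplist, output='rf_classification')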