[GRASS-SVN] r70943 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
svn_grass at osgeo.org
Sun Apr 23 22:03:48 PDT 2017
Author: spawley
Date: 2017-04-23 22:03:47 -0700 (Sun, 23 Apr 2017)
New Revision: 70943
Added:
grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py
Modified:
grass-addons/grass7/raster/r.learn.ml/Makefile
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
Log:
r.learn.ml moved functions into separate module
Modified: grass-addons/grass7/raster/r.learn.ml/Makefile
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/Makefile 2017-04-24 03:25:59 UTC (rev 70942)
+++ grass-addons/grass7/raster/r.learn.ml/Makefile 2017-04-24 05:03:47 UTC (rev 70943)
@@ -1,7 +1,9 @@
MODULE_TOPDIR = ../..
PGM = r.learn.ml
-
+
+ETCFILES = r_learn_utils
+
include $(MODULE_TOPDIR)/include/Make/Script.make
include $(MODULE_TOPDIR)/include/Make/Python.make
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-24 03:25:59 UTC (rev 70942)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-24 05:03:47 UTC (rev 70943)
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -- coding: utf-8 --
-
+#
############################################################################
# MODULE: r.learn.ml
# AUTHOR: Steven Pawley
@@ -323,911 +323,122 @@
import itertools
from copy import deepcopy
import numpy as np
-from numpy.random import RandomState
-import grass.script as grass
+from grass.pygrass.utils import set_path
+import grass.script as gscript
from grass.pygrass.modules.shortcuts import raster as r
-from grass.pygrass.raster import RasterRow
-from grass.pygrass.gis.region import Region
-from grass.pygrass.raster.buffer import Buffer
-from grass.pygrass.modules.shortcuts import imagery as im
-from grass.pygrass.vector import VectorTopo
-from grass.pygrass.vector.table import Link
-from grass.pygrass.utils import get_raster_for_points
-from subprocess import PIPE
+set_path('r.learn.ml')
+from r_learn_utils import (
+ cross_val_scores, predict, model_classifiers, save_training_data,
+ load_training_data, extract, maps_from_group, extract_points)
+
+
tmp_rast = []
def cleanup():
for rast in tmp_rast:
- grass.run_command("g.remove", name=rast, type='raster', flags='f', quiet=True)
+ gscript.run_command("g.remove", name=rast, type='raster', flags='f', quiet=True)
-def specificity_score(y_true, y_pred):
- """
- Function to calculate specificity score
- Args
- ----
- y_true: 1D numpy array of truth values
- y_pred: 1D numpy array of predicted classes
-
- Returns
- -------
- specificity: specificity score
- """
- from sklearn.metrics import confusion_matrix
-
- cm = confusion_matrix(y_true, y_pred)
- tn = float(cm[0][0])
- fp = float(cm[0][1])
-
- return (tn/(tn+fp))
-
-
-def varimp_permutation(estimator, X_test, y_true,
- n_permutations, scorer,
- random_state):
-
- """
- Method to perform permutation-based feature importance during
- cross-validation (cross-validation is applied externally to this
- method)
-
- Procedure is:
- 1. Pass fitted estimator and test partition X y
- 2. Assess AUC on the test partition (bestauc)
- 3. Permute each variable and assess the difference between bestauc and
- the messed-up variable
- 4. Repeat (3) for many random permutations
- 5. Average the repeats
-
- Args
- ----
- estimator: estimator that has been fitted to a training partition
- X_test, y_true: data and labels from a test partition
- n_permutations: number of random permutations to apply
- scorer: scikit-learn metric function to use
- random_state: seed to pass to the numpy random.seed
-
- Returns
- -------
- scores: scores for each predictor following permutation
- """
-
- # calculate score on original variables without permutation
- # determine best metric type for binary/multiclass/regression scenarios
- y_pred = estimator.predict(X_test)
- best_score = scorer(y_true, y_pred)
-
- np.random.seed(seed=random_state)
- rstate = RandomState(random_state)
- scores = np.zeros((n_permutations, X_test.shape[1]))
-
- # outer loop to repeat the pemutation rep times
- for rep in range(n_permutations):
-
- # inner loop to permute each predictor variable and assess
- # difference in auc
- for i in range(X_test.shape[1]):
- Xscram = np.copy(X_test)
- Xscram[:, i] = rstate.choice(X_test[:, i], X_test.shape[0])
-
- # fit the model on the training data and predict the test data
- y_pred = estimator.predict(Xscram)
- scores[rep, i] = best_score-scorer(y_true, y_pred)
- if scores[rep, i] < 0:
- scores[rep, i] = 0
-
- # average the repetitions
- scores = scores.mean(axis=0)
-
- return(scores)
-
-
-def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
- scoring=['accuracy'], feature_importances=False,
- n_permutations=25, models=False, random_state=None):
-
- """
- Stratified Kfold and GroupFold cross-validation using multiple
- scoring metrics and permutation feature importances
-
- Args
- ----
- estimator: Scikit learn estimator
- X: 2D numpy array of training data
- y: 1D numpy array representing response variable
- groups: 1D numpy array containing group labels
- sample_weight: 1D numpy array[n_samples,] of sample weights
- cv: Integer of cross-validation folds or sklearn.model_selection object
- sampling: Over- or under-sampling object with fit_sample method
- scoring: List of performance metrics to use
- feature_importances: Boolean to perform permutation-based importances
- n_permutations: Number of permutations during feature importance
- models: Boolean, return a list of the fitted models
- random_state: Seed to pass to the random number generator
- """
-
- from sklearn import metrics
- from sklearn.model_selection import (
- RandomizedSearchCV, GridSearchCV, StratifiedKFold)
-
- estimator = deepcopy(estimator)
- fitted_models = []
-
- # create model_selection method
- if isinstance(cv, int):
- cv = StratifiedKFold(n_splits=cv)
-
- # fit the model on the training data and predict the test data
- # need the groups parameter because the estimator can be a
- # RandomizedSearchCV or GridSearchCV estimator where cv=GroupKFold
- if isinstance(estimator, RandomizedSearchCV) is True \
- or isinstance(estimator, GridSearchCV):
- param_search = True
- else:
- param_search = False
-
- # create dictionary of lists to store metrics
- scores = dict.fromkeys(scoring)
- scores = {key: [] for key, value in scores.iteritems()}
- scoring_methods = {'accuracy': metrics.accuracy_score,
- 'balanced_accuracy': metrics.recall_score,
- 'average_precision': metrics.average_precision_score,
- 'brier_loss': metrics.brier_score_loss,
- 'kappa': metrics.cohen_kappa_score,
- 'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'hamming_loss': metrics.hamming_loss,
- 'jaccard_similarity': metrics.jaccard_similarity_score,
- 'log_loss': metrics.log_loss,
- 'matthews_corrcoef': metrics.matthews_corrcoef,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score,
- 'specificity': specificity_score,
- 'roc_auc': metrics.roc_auc_score,
- 'zero_one_loss': metrics.zero_one_loss,
- 'r2': metrics.r2_score,
- 'neg_mean_squared_error': metrics.mean_squared_error}
-
- byclass_methods = {'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score}
-
- # create diction to store byclass metrics results
- n_classes = len(np.unique(y))
- labels = np.unique(y)
- byclass_scores = dict.fromkeys(byclass_methods)
- byclass_scores = { key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
-
- # remove any byclass_scorers that are not in the scoring list
- byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
-
- # set averaging type for global binary or multiclass scores
- if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
- average='binary'
- else:
- average='macro'
-
- # check to see if scoring is a valid sklearn metric
- for i in scores.keys():
- try:
- list(scoring_methods.keys()).index(i)
- except:
- print('Scoring ' + i + ' is not a valid scoring method')
- print('Valid methods are:')
- print(scoring_methods.keys())
-
- # create np array to store feature importance scores
- if feature_importances is True:
- fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
- fimp[:] = np.nan
- else:
- fimp = None
-
- # generate Kfold indices
- if groups is None:
- k_fold = cv.split(X, y)
- else:
- k_fold = cv.split(X, y, groups=groups)
-
- # train on k-1 folds and test of k folds
- for train_indices, test_indices in k_fold:
-
- # subset training and test fold data
- X_train, X_test = X[train_indices], X[test_indices]
- y_train, y_test = y[train_indices], y[test_indices]
-
- # subset training and test fold group ids
- if groups is not None: groups_train = groups[train_indices]
- else: groups_train = None
-
- # subset training and test fold sample_weight
- if sample_weight is not None: weights = sample_weight[train_indices]
-
- # train estimator on training fold
- if groups is not None and param_search is True:
- if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
- else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
- else:
- if sample_weight is None: estimator.fit(X_train, y_train)
- else: estimator.fit(X_train, y_train, sample_weight=weights)
-
- if models is True:
- fitted_models.append(deepcopy(estimator))
-
- # prediction of test fold
- y_pred = estimator.predict(X_test)
-
- # calculate global performance metrics
- for m in scores.keys():
- # metrics that require probabilties
- if m == 'brier_loss' or m == 'roc_auc':
- y_prob = estimator.predict_proba(X_test)[:, 1]
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_prob))
-
- # metrics that have no averaging for multiclass
- elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
- or m == 'hamming_loss' or m == 'jaccard_similarity' \
- or m == 'log_loss' or m == 'zero_one_loss' or m == 'matthews_corrcoef':
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_pred))
-
- # balanced accuracy
- elif m == 'balanced_accuracy':
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average='macro'))
-
- # metrics that have averaging for multiclass
- else:
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average=average))
-
- # calculate per-class performance metrics
- for key in byclass_scores.keys():
- byclass_scores[key] = np.vstack((
- byclass_scores[key], byclass_methods[key](
- y_test, y_pred, labels=labels, average=None)))
-
- # feature importances using permutation
- if feature_importances is True:
- if bool((np.isnan(fimp)).all()) is True:
- fimp = varimp_permutation(
- estimator, X_test, y_test, n_permutations,
- scoring_methods[scoring[0]],
- random_state)
- else:
- fimp = np.row_stack(
- (fimp, varimp_permutation(
- estimator, X_test, y_test,
- n_permutations, scoring_methods[scoring[0]],
- random_state)))
-
- return(scores, byclass_scores, fimp, fitted_models)
-
-
-def predict(estimator, predictors, output, predict_type='raw',
- index=None, rowincr=25):
-
- """
- Prediction on list of GRASS rasters using a fitted scikit learn model
-
- Args
- ----
- estimator: scikit-learn estimator object
- predictors: list of GRASS rasters
- output: Name of GRASS raster to output classification results
- predict_type: character, 'raw' for classification/regression;
- 'prob' for class probabilities
- index: Optional, list of class indices to export
- rowincr: Integer of raster rows to process at one time
- """
-
- # open predictors as list of rasterrow objects
- current = Region()
- n_features = len(predictors)
- rasstack = [0] * n_features
-
- for i in range(n_features):
- rasstack[i] = RasterRow(predictors[i])
- if rasstack[i].exist() is True:
- rasstack[i].open('r')
- else:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # Prediction using row blocks
- for rowblock in range(0, current.rows, rowincr):
- grass.percent(rowblock, current.rows, rowincr)
-
- # check that the row increment does not exceed the number of rows
- if rowblock+rowincr > current.rows:
- rowincr = current.rows - rowblock
- img_np_row = np.zeros((rowincr, current.cols, n_features))
-
- # loop through each row, and each band
- # and add these values to the 2D array img_np_row
- for row in range(rowblock, rowblock+rowincr, 1):
- for band in range(n_features):
- img_np_row[row-rowblock, :, band] = \
- np.array(rasstack[band][row])
-
- # create mask
- img_np_row[img_np_row == -2147483648] = np.nan
- mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
- for feature in range(n_features):
- invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
- mask[invalid_indexes] = np.nan
-
- # reshape each row-band matrix into a n*m array
- nsamples = rowincr * current.cols
- flat_pixels = img_np_row.reshape((nsamples, n_features))
-
- # remove NaNs prior to passing to scikit-learn predict
- flat_pixels = np.nan_to_num(flat_pixels)
-
- # perform prediction for classification/regression
- if predict_type == 'raw':
- result = estimator.predict(flat_pixels)
- result = result.reshape((rowincr, current.cols))
-
- # determine nodata value and grass raster type
- if result.dtype == 'float':
- nodata = np.nan
- ftype = 'FCELL'
- else:
- nodata = -2147483648
- ftype = 'CELL'
-
- # replace NaN values so that the prediction does not have a border
- result[np.nonzero(np.isnan(mask))] = nodata
-
- # for each row we can perform computation, and write the result
- if rowblock == 0:
- # create and open RasterRow object for writing of classification result
- if predict_type == 'raw':
- classification = RasterRow(output)
- classification.open('w', ftype, overwrite=True)
-
- for row in range(rowincr):
- newrow = Buffer((result.shape[1],), mtype=ftype)
- newrow[:] = result[row, :]
- classification.put_row(newrow)
-
- # perform prediction for class probabilities
- if predict_type == 'prob':
- result_proba = estimator.predict_proba(flat_pixels)
-
- # on first loop determine number of probability classes
- # and open rasterrow objects for writing
- if rowblock == 0:
- if index is None:
- index = range(result_proba.shape[1])
- n_classes = len(index)
- else:
- n_classes = len(np.unique(index))
-
- # create and open RasterRow objects for probabilities
- prob_out_raster = [0] * n_classes
- prob = [0] * n_classes
- for iclass, label in enumerate(index):
- prob_out_raster[iclass] = output + '_classPr' + str(label)
- prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite=True)
-
- for iclass, label in enumerate(index):
- result_proba_class = result_proba[:, label]
- result_proba_class = result_proba_class.reshape((rowincr, current.cols))
- result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
-
- for row in range(rowincr):
- newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
-
- # close all maps
- for i in range(n_features): rasstack[i].close()
- if predict_type == 'raw': classification.close()
- if predict_type == 'prob':
- try:
- for iclass in range(n_classes):
- prob[iclass].close()
- except:
- pass
-
-
-def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
-
- """
- Provides the classifiers and parameters using by the module
-
- Args
- ----
- estimator: Name of estimator
- random_state: Seed to use in randomized components
- n_jobs: Integer, number of processing cores to use
- p: Dict, containing classifier setttings
- weights: None, or 'balanced' to add class_weights
-
- Returns
- -------
- clf: Scikit-learn classifier object
- mode: Flag to indicate whether classifier performs classification or
- regression
- """
-
- from sklearn.linear_model import LogisticRegression
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
- from sklearn.naive_bayes import GaussianNB
- from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
- from sklearn.ensemble import (
- RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
- ExtraTreesRegressor)
- from sklearn.ensemble import GradientBoostingClassifier
- from sklearn.ensemble import GradientBoostingRegressor
- from sklearn.svm import SVC
-
- # optional packages that add additional classifiers here
- if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
- try:
- from sklearn.pipeline import Pipeline
- from pyearth import Earth
-
- earth_classifier = Pipeline([('Earth',
- Earth(max_degree=p['max_degree'])),
- ('Logistic', LogisticRegression(n_jobs=n_jobs))])
-
- classifiers = {'EarthClassifier': earth_classifier,
- 'EarthRegressor': Earth(max_degree=p['max_degree'])}
- except:
- grass.fatal('Py-earth package not installed')
-
- elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
- try:
- from xgboost import XGBClassifier, XGBRegressor
-
- if p['max_depth'] is None:
- p['max_depth'] = int(3)
-
- classifiers = {
- 'XGBClassifier':
- XGBClassifier(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- subsample=p['subsample'],
- nthread=n_jobs),
- 'XGBRegressor':
- XGBRegressor(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- subsample=p['subsample'],
- nthread=n_jobs)}
- except:
- grass.fatal('XGBoost package not installed')
- else:
- # core sklearn classifiers go here
- classifiers = {
- 'SVC': SVC(C=p['C'],
- class_weight=weights,
- probability=True,
- random_state=random_state),
- 'LogisticRegression':
- LogisticRegression(C=p['C'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- fit_intercept=True),
- 'DecisionTreeClassifier':
- DecisionTreeClassifier(max_depth=p['max_depth'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state),
- 'DecisionTreeRegressor':
- DecisionTreeRegressor(max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state),
- 'RandomForestClassifier':
- RandomForestClassifier(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'RandomForestRegressor':
- RandomForestRegressor(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'ExtraTreesClassifier':
- ExtraTreesClassifier(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'ExtraTreesRegressor':
- ExtraTreesRegressor(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'GradientBoostingClassifier':
- GradientBoostingClassifier(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- subsample=p['subsample'],
- max_features=p['max_features'],
- random_state=random_state),
- 'GradientBoostingRegressor':
- GradientBoostingRegressor(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- subsample=p['subsample'],
- max_features=p['max_features'],
- random_state=random_state),
- 'GaussianNB': GaussianNB(),
- 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
- 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
- }
-
- # define classifier
- clf = classifiers[estimator]
-
- # classification or regression
- if estimator == 'LogisticRegression' \
- or estimator == 'DecisionTreeClassifier' \
- or estimator == 'RandomForestClassifier' \
- or estimator == 'ExtraTreesClassifier' \
- or estimator == 'GradientBoostingClassifier' \
- or estimator == 'GaussianNB' \
- or estimator == 'LinearDiscriminantAnalysis' \
- or estimator == 'QuadraticDiscriminantAnalysis' \
- or estimator == 'EarthClassifier' \
- or estimator == 'XGBClassifier' \
- or estimator == 'SVC':
- mode = 'classification'
- else:
- mode = 'regression'
-
- return (clf, mode)
-
-
-def save_training_data(X, y, groups, file):
-
- """
- Saves any extracted training data to a csv file
-
- Args
- ----
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels
- file: Path to a csv file to save data to
- """
-
- # if there are no group labels, create a nan filled array
- if groups is None:
- groups = np.empty((y.shape[0]))
- groups[:] = np.nan
-
- training_data = np.column_stack([X, y, groups])
- np.savetxt(file, training_data, delimiter=',')
-
-
-def load_training_data(file):
-
- """
- Loads training data and labels from a csv file
-
- Args
- ----
- file: Path to a csv file to save data to
-
- Returns
- -------
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels, or None
- """
-
- training_data = np.loadtxt(file, delimiter=',')
- n_cols = training_data.shape[1]
- last_Xcol = n_cols-2
-
- # check to see if last column contains group labels or nans
- groups = training_data[:, -1]
-
- # if all nans then set groups to None
- if bool(np.isnan(groups).all()) is True:
- groups = None
-
- # fetch X and y
- X = training_data[:, 0:last_Xcol]
- y = training_data[:, -2]
-
- return(X, y, groups)
-
-
-def extract(response, predictors, lowmem=False):
-
- """
- Samples a list of GRASS rasters using a labelled raster
- Per raster sampling
-
- Args
- ----
- response: String; GRASS raster with labelled pixels
- predictors: List of GRASS rasters containing explanatory variables
- lowmem: Boolean, use numpy memmap to query predictors
-
- Returns
- -------
-
- training_data: Numpy array of extracted raster values
- training_labels: Numpy array of labels
- is_train: Row and Columns of label positions
- """
-
- current = Region()
-
- # open response raster as rasterrow and read as np array
- if RasterRow(response).exist() is True:
- roi_gr = RasterRow(response)
- roi_gr.open('r')
-
- if lowmem is False:
- response_np = np.array(roi_gr)
- else:
- response_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
- response_np[:] = np.array(roi_gr)[:]
- else:
- grass.fatal("GRASS response raster does not exist.... exiting")
-
- # determine number of predictor rasters
- n_features = len(predictors)
-
- # check to see if all predictors exist
- for i in range(n_features):
- if RasterRow(predictors[i]).exist() is not True:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # check if any of those pixels are labelled (not equal to nodata)
- # can use even if roi is FCELL because nodata will be nan
- is_train = np.nonzero(response_np > -2147483648)
- training_labels = response_np[is_train]
- n_labels = np.array(is_train).shape[1]
-
- # Create a zero numpy array of len training labels
- if lowmem is False:
- training_data = np.zeros((n_labels, n_features))
- else:
- training_data = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(n_labels, n_features))
-
- # Loop through each raster and sample pixel values at training indexes
- if lowmem is True:
- feature_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
-
- for f in range(n_features):
- predictor_gr = RasterRow(predictors[f])
- predictor_gr.open('r')
-
- if lowmem is False:
- feature_np = np.array(predictor_gr)
- else:
- feature_np[:] = np.array(predictor_gr)[:]
-
- training_data[0:n_labels, f] = feature_np[is_train]
-
- # close each predictor map
- predictor_gr.close()
-
- # convert any CELL maps no datavals to NaN in the training data
- for i in range(n_features):
- training_data[training_data[:, i] == -2147483648] = np.nan
-
- # convert indexes of training pixels from tuple to n*2 np array
- is_train = np.array(is_train).T
-
- # close the response map
- roi_gr.close()
-
- return(training_data, training_labels, is_train)
-
-
-def maps_from_group(group):
-
- """
- Parse individual rasters into a list from an imagery group
-
- Args
- ----
- group: String; GRASS imagery group
-
- Returns
- -------
- maplist: List containing individual GRASS raster maps
- map_names: List with print friendly map names
- """
- groupmaps = im.group(group=group, flags="g",
- quiet=True, stdout_=PIPE).outputs.stdout
-
- maplist = groupmaps.split(os.linesep)
- maplist = maplist[0:len(maplist)-1]
- map_names = []
-
- for rastername in maplist:
- map_names.append(rastername.split('@')[0])
-
- return(maplist, map_names)
-
-
-def extract_points(gvector, grasters, field):
- """
- Extract values from grass rasters using vector points input
-
- Args
- ----
- gvector: character, name of grass points vector
- grasters: list of names of grass raster to query
- field: character, name of field in table to use as response variable
-
- Returns
- -------
- X: 2D numpy array of training data
- y: 1D numpy array with the response variable
- coordinates: 2D numpy array of sample coordinates
- """
- # open grass vector
- points = VectorTopo(gvector.split('@')[0])
- points.open('r')
-
- # create link to attribute table
- points.dblinks.by_name(name=gvector)
-
- # extract table field to numpy array
- table = points.table
- cur = table.execute("SELECT {field} FROM {name}".format(field=field, name=table.name))
- y = np.array([np.isnan if c is None else c[0] for c in cur])
-
- # extract raster data
- X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
- for i, raster in enumerate(grasters):
- rio = RasterRow(raster)
- values = np.asarray(get_raster_for_points(points, rio))
- coordinates = values[:, 1:3]
- X[:, i] = values[:, 3]
-
- # set any grass integer nodata values to NaN
- X[X == -2147483648] = np.nan
-
- # remove missing response data
- X = X[~np.isnan(y)]
- coordinates = coordinates[~np.isnan(y)]
- y = y[~np.isnan(y)]
-
- # close
- points.close()
-
- return(X, y, coordinates)
-
def main():
try:
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import StandardScaler, Imputer
- from sklearn.model_selection import GridSearchCV
+ from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn import metrics
- from sklearn.metrics import make_scorer
+ from sklearn.metrics import make_scorer, confusion_matrix
import warnings
- warnings.filterwarnings('ignore') # turn off UndefinedMetricWarning
+ warnings.filterwarnings('ignore')
except:
- grass.fatal("Scikit learn 0.18 or newer is not installed")
+ gscript.fatal("Scikit learn 0.18 or newer is not installed")
+ # required gui section
group = options['group']
trainingmap = options['trainingmap']
trainingpoints = options['trainingpoints']
field = options['field']
output = options['output']
- if '@' in output:
- output = output.split('@')[0]
+
+ # classifier gui section
classifier = options['classifier']
- norm_data = flags['s']
+ hyperparams = {
+ 'C': options['c'],
+ 'min_samples_split': options['min_samples_split'],
+ 'min_samples_leaf': options['min_samples_leaf'],
+ 'n_estimators': options['n_estimators'],
+ 'learning_rate': options['learning_rate'],
+ 'subsample': options['subsample'],
+ 'max_depth': options['max_depth'],
+ 'max_features': options['max_features'],
+ 'max_degree': options['max_degree']
+ }
+
+ # cross validation
cv = int(options['cv'])
cvtype = options['cvtype']
group_raster = options['group_raster']
- categorymaps = options['categorymaps']
n_partitions = int(options['n_partitions'])
- modelonly = flags['m']
+ tune_only = flags['t']
+ predict_resamples = flags['r']
+ importances = flags['f']
+ n_permutations = int(options['n_permutations'])
+ errors_file = options['errors_file']
+ fimp_file = options['fimp_file']
+ param_file = options['param_file']
+
+ # general options
+ norm_data = flags['s']
+ categorymaps = options['categorymaps']
+ model_only = flags['m']
probability = flags['p']
prob_only = flags['z']
- tuneonly = flags['t']
- predict_resamples = flags['r']
rowincr = int(options['lines'])
random_state = int(options['random_state'])
model_save = options['save_model']
model_load = options['load_model']
load_training = options['load_training']
save_training = options['save_training']
- importances = flags['f']
-
indexes = options['indexes']
- if ',' in indexes:
- indexes = [int(i) for i in indexes.split(',')]
- else:
- indexes = [int(indexes)] # predict expects list
- if indexes == [-1]:
- indexes = None
- n_permutations = int(options['n_permutations'])
n_jobs = int(options['n_jobs'])
lowmem = flags['l']
impute = flags['i']
- errors_file = options['errors_file']
- fimp_file = options['fimp_file']
- param_file = options['param_file']
-
balance = flags['b']
- if balance is True:
- balance = 'balanced'
- else: balance = None
+
+ # convert to lists
if ',' in categorymaps:
categorymaps = [int(i) for i in categorymaps.split(',')]
else: categorymaps = None
+ if ',' in indexes:
+ indexes = [int(i) for i in indexes.split(',')]
+ else:
+ indexes = int(indexes)
+ if indexes == -1: indexes = None
+
# error checking
+ # remove @ from output in case overwriting result
+ if '@' in output:
+ output = output.split('@')[0]
+
# feature importances selected by no cross-validation scheme used
if importances is True and cv == 1:
- grass.fatal('Feature importances require cross-validation cv > 1')
+ gscript.fatal('Feature importances require cross-validation cv > 1')
- # output map has not been entered and modelonly is not set to True
- if output == '' and modelonly is True:
- grass.fatal('No output map specified')
+ # output map has not been entered and model_only is not set to True
+ if output == '' and model_only is True:
+ gscript.fatal('No output map specified')
# perform prediction only for class probabilities but probability flag is not set to True
if prob_only is True:
probability = True
# make dicts for hyperparameters, datatypes and parameters for tuning
- hyperparams = {'C': options['c'],
- 'min_samples_split': options['min_samples_split'],
- 'min_samples_leaf': options['min_samples_leaf'],
- 'n_estimators': options['n_estimators'],
- 'learning_rate': options['learning_rate'],
- 'subsample': options['subsample'],
- 'max_depth': options['max_depth'],
- 'max_features': options['max_features'],
- 'max_degree': options['max_degree']}
hyperparams_type = dict.fromkeys(hyperparams, int)
hyperparams_type['C'] = float
hyperparams_type['learning_rate'] = float
@@ -1257,14 +468,16 @@
# scoring metrics
if mode == 'classification':
- scoring = ['accuracy', 'precision', 'recall', 'f1', 'kappa', 'balanced_accuracy']
+ scoring = ['accuracy', 'precision', 'recall', 'f1', 'kappa',\
+ 'balanced_accuracy']
search_scorer = make_scorer(metrics.cohen_kappa_score)
else:
scoring = ['r2', 'neg_mean_squared_error']
search_scorer = 'r2'
- # Sample training data and group ids
- # ----------------------------------
+ # -------------------------------------------------------------------------
+ # Extract training data
+ # -------------------------------------------------------------------------
# fetch individual raster names from group
maplist, map_names = maps_from_group(group)
@@ -1275,11 +488,10 @@
if load_training != '':
X, y, group_id = load_training_data(load_training)
else:
- grass.message('Extracting training data')
+ gscript.message('Extracting training data')
- # clump the labelled pixel raster if labels represent polygons
- # then set the group_raster to the clumped raster to extract the
- # group_ids used in the GroupKFold cross-validation
+ # generate spatial clump/patch partitions
+ # clump the labelled pixel raster and set the group_raster to the clumped raster
if trainingmap != '' and cvtype == 'clumped' and group_raster == '':
clumped_trainingmap = 'tmp_clumped_trainingmap'
tmp_rast.append(clumped_trainingmap)
@@ -1287,47 +499,41 @@
overwrite=True, quiet=True)
group_raster = clumped_trainingmap
elif trainingmap == '' and cvtype == 'clumped':
- grass.fatal('Cross-validation using clumped training areas ',
- 'requires raster-based training areas')
+ gscript.fatal('Cross-validation using clumped training areas ',
+ 'requires raster-based training areas')
- # extract training data from maplist and take group ids from
- # group_raster. Shuffle=False so that group ids and labels align
- # because cross-validation will be performed spatially
+ # append spatial clumps or group raster to the predictors
if group_raster != '':
maplist2 = deepcopy(maplist)
maplist2.append(group_raster)
- if trainingmap != '':
- X, y, sample_coords = extract(
- response=trainingmap, predictors=maplist2, lowmem=lowmem)
- elif trainingpoints != '':
- X, y, sample_coords = extract_points(
- trainingpoints, maplist2, field)
+ else:
+ maplist2 = maplist
- # take group id from last column and remove from predictors
+ # extract training data
+ if trainingmap != '':
+ X, y, sample_coords = extract(
+ response=trainingmap, predictors=maplist2, lowmem=lowmem)
+ elif trainingpoints != '':
+ X, y, sample_coords = extract_points(
+ trainingpoints, maplist2, field)
+ group_id = None
+
+ # take group id from last column and remove from predictors
+ if group_raster != '':
group_id = X[:, -1]
X = np.delete(X, -1, axis=1)
- else:
- # extract training data from maplist without group Ids
- # shuffle this data by default
- if trainingmap != '':
- X, y, sample_coords = extract(
- response=trainingmap, predictors=maplist, lowmem=lowmem)
- elif trainingpoints != '':
- X, y, sample_coords = extract_points(
- trainingpoints, maplist, field)
- group_id = None
- if cvtype == 'kmeans':
- clusters = KMeans(
- n_clusters=n_partitions,
- random_state=random_state, n_jobs=n_jobs)
+ if cvtype == 'kmeans':
+ clusters = KMeans(
+ n_clusters=n_partitions,
+ random_state=random_state, n_jobs=n_jobs)
+ clusters.fit(sample_coords)
+ group_id = clusters.labels_
- clusters.fit(sample_coords)
- group_id = clusters.labels_
# check for labelled pixels and training data
if y.shape[0] == 0 or X.shape[0] == 0:
- grass.fatal('No training pixels or pixels in imagery group '
- '...check computational region')
+ gscript.fatal('No training pixels or pixels in imagery group '
+ '...check computational region')
# impute or remove NaNs
if impute is False:
@@ -1347,13 +553,17 @@
X, y, sample_coords, group_id = shuffle(
X, y, sample_coords, group_id, random_state=random_state)
- # option to save extracted data to .csv file
- if save_training != '':
- save_training_data(X, y, group_id, save_training)
+ # optionally save extracted data to .csv file
+ if save_training != '':
+ save_training_data(X, y, group_id, save_training)
+ # ---------------------------------------------------------------------
+ # define the hyperparameter inner search cross validation method
+ # ---------------------------------------------------------------------
+
# define model selection cross-validation method
if any(param_grid) is True and cv == 1:
- grass.fatal('Hyperparameter search requires cv > 1')
+ gscript.fatal('Hyperparameter search requires cv > 1')
if any(param_grid) is True or cv > 1:
if group_id is None:
resampling = StratifiedKFold(
@@ -1363,11 +573,8 @@
else:
resampling = None
- # define preprocessing pipeline
- # -----------------------------
-
# sample weights for GradientBoosting or XGBClassifier
- if balance == 'balanced' and mode == 'classification' and classifier in (
+ if balance is True and mode == 'classification' and classifier in (
'GradientBoostingClassifier', 'XGBClassifier'):
from sklearn.utils import compute_class_weight
class_weights = compute_class_weight(
@@ -1375,18 +582,26 @@
else:
class_weights = None
- # scaling and onehot encoding
+ # ---------------------------------------------------------------------
+ # define the preprocessing pipeline
+ # ---------------------------------------------------------------------
+
+ # standardization
if norm_data is True and categorymaps is None:
clf = Pipeline([('scaling', StandardScaler()),
('classifier', clf)])
+
+ # onehot encoding
if categorymaps is not None and norm_data is False:
enc = OneHotEncoder(categorical_features=categorymaps)
enc.fit(X)
clf = Pipeline([('onehot', OneHotEncoder(
categorical_features=categorymaps,
- n_values=enc.n_values_, handle_unknown='ignore', # prevent failure due to new categorical vars
+ n_values=enc.n_values_, handle_unknown='ignore',
sparse=False)), # dense because not all clf can use sparse
('classifier', clf)])
+
+ # standardization and onehot encoding
if norm_data is True and categorymaps is not None:
enc = OneHotEncoder(categorical_features=categorymaps)
enc.fit(X)
@@ -1397,8 +612,9 @@
('scaling', StandardScaler()),
('classifier', clf)])
- # define hyperparameter grid search
- # ---------------------------------
+ # ---------------------------------------------------------------------
+ # create the hyperparameter grid search method
+ # ---------------------------------------------------------------------
# check if dict contains and keys - perform GridSearchCV
if any(param_grid) is True:
@@ -1414,85 +630,88 @@
estimator=clf, param_grid=param_grid, scoring=search_scorer,
n_jobs=n_jobs, cv=resampling)
+ # ---------------------------------------------------------------------
# classifier training
- # -------------------
+ # ---------------------------------------------------------------------
- # fit and parameter search
- grass.message(os.linesep)
- grass.message(('Fitting model using ' + classifier))
+ gscript.message(os.linesep)
+ gscript.message(('Fitting model using ' + classifier))
- # use groups if GroupKFold and param_grid are present
+ # pass groups to fit parameter GroupKFold and param_grid are present
if isinstance(resampling, GroupKFold) and any(param_grid) is True:
- if balance == 'balanced' and classifier in (
+ if balance is True and classifier in (
'GradientBoostingClassifier', 'XGBClassifier'):
clf.fit(X=X, y=y, groups=group_id, sample_weight=class_weights)
else:
clf.fit(X=X, y=y, groups=group_id)
else:
- if balance == 'balanced' and classifier in (
+ if balance is True and classifier in (
'GradientBoostingClassifier', 'XGBClassifier'):
clf.fit(X=X, y=y, sample_weight=class_weights)
else:
clf.fit(X, y)
+ # message best hyperparameter setup and optionally save using pandas
if any(param_grid) is True:
- grass.message(os.linesep)
- grass.message('Best parameters:')
- grass.message(str(clf.best_params_))
+ gscript.message(os.linesep)
+ gscript.message('Best parameters:')
+ gscript.message(str(clf.best_params_))
if param_file != '':
try:
import pandas as pd
param_df = pd.DataFrame(clf.cv_results_)
param_df.to_csv(param_file)
except:
- grass.message((
+ gscript.message((
"Pandas is not installed ",
"cannot export hyperparameter search results to csv"))
+ # ---------------------------------------------------------------------
# cross-validation
- # -----------------
+ # ---------------------------------------------------------------------
# If cv > 1 then use cross-validation to generate performance measures
- if cv > 1 and tuneonly is not True:
+ if cv > 1 and tune_only is not True:
if mode == 'classification' and cv > np.histogram(
y, bins=len(np.unique(y)))[0].min():
- grass.message(os.linesep)
- grass.message('Number of cv folds is greater than number of '
- 'samples in some classes. Cross-validation is being'
- ' skipped')
+ gscript.message(os.linesep)
+ gscript.message('Number of cv folds is greater than number of '
+ 'samples in some classes. Cross-validation is being'
+ ' skipped')
else:
- grass.message(os.linesep)
- grass.message(
+ gscript.message(os.linesep)
+ gscript.message(
"Cross validation global performance measures......:")
- # cross-validate
+ # add auc and mcc as scorer if classification is binary
if mode == 'classification' and \
len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
scoring.append('roc_auc')
scoring.append('matthews_corrcoef')
+ # perform the cross-validatation
scores, cscores, fimp, models = cross_val_scores(
clf, X, y, group_id, class_weights, resampling, scoring,
importances, n_permutations, predict_resamples, random_state)
# global scores
for method, val in scores.iteritems():
- grass.message(
+ gscript.message(
method+":\t%0.3f\t+/-SD\t%0.3f" %
(val.mean(), val.std()))
# individual class scores
if mode == 'classification' and len(np.unique(y)) != 2:
- grass.message(os.linesep)
- grass.message('Cross validation class performance measures......:')
- grass.message('Class \t' + '\t'.join(map(str, np.unique(y))))
+ gscript.message(os.linesep)
+ gscript.message('Cross validation class performance measures......:')
+ gscript.message('Class \t' + '\t'.join(map(str, np.unique(y))))
for method, val in cscores.iteritems():
mat_cscores = np.matrix(val)
- grass.message(
+ gscript.message(
method+':\t' + '\t'.join(
map(str, np.round(mat_cscores.mean(axis=0), 2)[0])))
- grass.message(
+ gscript.message(
method+' std:\t' + '\t'.join(
map(str, np.round(mat_cscores.std(axis=0), 2)[0])))
@@ -1503,13 +722,13 @@
# feature importances
if importances is True:
- grass.message(os.linesep)
- grass.message("Feature importances")
- grass.message("id" + "\t" + "Raster" + "\t" + "Importance")
+ gscript.message(os.linesep)
+ gscript.message("Feature importances")
+ gscript.message("id" + "\t" + "Raster" + "\t" + "Importance")
# mean of cross-validation feature importances
for i in range(len(fimp.mean(axis=0))):
- grass.message(
+ gscript.message(
str(i) + "\t" + maplist[i] +
"\t" + str(round(fimp.mean(axis=0)[i], 4)))
@@ -1526,48 +745,42 @@
if model_save != '':
joblib.dump(clf, model_save)
- # prediction on the rest of the GRASS rasters in the imagery group
- # ----------------------------------------------------------------
+ # -------------------------------------------------------------------------
+ # prediction on grass imagery group
+ # -------------------------------------------------------------------------
- if modelonly is not True:
- grass.message(os.linesep)
- if mode == 'classification':
- if prob_only is False:
- grass.message('Predicting classification raster...')
- predict(estimator=clf, predictors=maplist, output=output, predict_type='raw',
- rowincr=rowincr)
+ if model_only is not True:
+ gscript.message(os.linesep)
- if predict_resamples is True:
- for i in range(cv):
- resample_name = output + '_Resample' + str(i)
- predict(estimator=models[i], predictors=maplist, output=resample_name, predict_type='raw',
- rowincr=rowincr)
+ # predict classification/regression raster
+ if prob_only is False:
+ gscript.message('Predicting classification/regression raster...')
+ predict(estimator=clf, predictors=maplist, output=output,
+ predict_type='raw', rowincr=rowincr)
- if probability is True:
- grass.message('Predicting class probabilities...')
- predict(estimator=clf, predictors=maplist, output=output, predict_type='prob',
- index=indexes, rowincr=rowincr)
+ if predict_resamples is True:
+ for i in range(cv):
+ resample_name = output + '_Resample' + str(i)
+ predict(estimator=models[i], predictors=maplist,
+ output=resample_name, predict_type='raw',
+ rowincr=rowincr)
- if predict_resamples is True:
- for i in range(cv):
- resample_name = output + '_Resample' + str(i)
- predict(estimator=models[i], predictors=maplist, output=resample_name, predict_type='prob',
- index=indexes, rowincr=rowincr)
+ # predict class probabilities
+ if probability is True:
+ gscript.message('Predicting class probabilities...')
+ predict(estimator=clf, predictors=maplist, output=output, predict_type='prob',
+ index=indexes, rowincr=rowincr)
- elif mode == 'regression':
- grass.message('Predicting regression raster...')
- predict(estimator=clf, predictors=maplist, output=output, predict_type='raw',
- rowincr=rowincr)
-
if predict_resamples is True:
for i in range(cv):
resample_name = output + '_Resample' + str(i)
- predict(estimator=models[i], predictors=maplist, output=resample_name, predict_type='prob',
- rowincr=rowincr)
+ predict(estimator=models[i], predictors=maplist,
+ output=resample_name, predict_type='prob',
+ index=indexes, rowincr=rowincr)
else:
- grass.message("Model built and now exiting")
+ gscript.message("Model built and now exiting")
if __name__ == "__main__":
- options, flags = grass.parser()
+ options, flags = gscript.parser()
atexit.register(cleanup)
main()
Added: grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py (rev 0)
+++ grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py 2017-04-24 05:03:47 UTC (rev 70943)
@@ -0,0 +1,800 @@
+#!/usr/bin/env python
+# -- coding: utf-8 --
+
+import numpy as np
+from numpy.random import RandomState
+from grass.pygrass.modules.shortcuts import raster as r
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.modules.shortcuts import imagery as im
+from grass.pygrass.vector import VectorTopo
+from grass.pygrass.vector.table import Link
+from grass.pygrass.utils import get_raster_for_points
+from subprocess import PIPE
+
+def specificity_score(y_true, y_pred):
+ """
+ Calculate specificity score
+
+ Args
+ ----
+ y_true: 1D numpy array of truth values
+ y_pred: 1D numpy array of predicted classes
+
+ Returns
+ -------
+ specificity: specificity score
+ """
+ from sklearn.metrics import confusion_matrix
+
+ cm = confusion_matrix(y_true, y_pred)
+ tn = float(cm[0][0])
+ fp = float(cm[0][1])
+
+ return tn/(tn+fp)
+
+
+def varimp_permutation(estimator, X_test, y_true,
+ n_permutations, scorer,
+ random_state):
+
+ """
+ Method to perform permutation-based feature importance during
+ cross-validation (cross-validation is applied externally to this
+ method)
+
+ Procedure is:
+ 1. Pass fitted estimator and test partition X y
+ 2. Assess AUC on the test partition (bestauc)
+ 3. Permute each variable and assess the difference between bestauc and
+ the messed-up variable
+ 4. Repeat (3) for many random permutations
+ 5. Average the repeats
+
+ Args
+ ----
+ estimator: estimator that has been fitted to a training partition
+ X_test, y_true: data and labels from a test partition
+ n_permutations: number of random permutations to apply
+ scorer: scikit-learn metric function to use
+ random_state: seed to pass to the numpy random.seed
+
+ Returns
+ -------
+ scores: scores for each predictor following permutation
+ """
+
+ # calculate score on original variables without permutation
+ # determine best metric type for binary/multiclass/regression scenarios
+ y_pred = estimator.predict(X_test)
+ best_score = scorer(y_true, y_pred)
+
+ np.random.seed(seed=random_state)
+ rstate = RandomState(random_state)
+ scores = np.zeros((n_permutations, X_test.shape[1]))
+
+ # outer loop to repeat the pemutation rep times
+ for rep in range(n_permutations):
+
+ # inner loop to permute each predictor variable and assess
+ # difference in auc
+ for i in range(X_test.shape[1]):
+ Xscram = np.copy(X_test)
+ Xscram[:, i] = rstate.choice(X_test[:, i], X_test.shape[0])
+
+ # fit the model on the training data and predict the test data
+ y_pred = estimator.predict(Xscram)
+ scores[rep, i] = best_score-scorer(y_true, y_pred)
+ if scores[rep, i] < 0:
+ scores[rep, i] = 0
+
+ # average the repetitions
+ scores = scores.mean(axis=0)
+
+ return scores
+
+
+def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
+ scoring='accuracy', feature_importances=False,
+ n_permutations=25, models=False, random_state=None):
+ """
+ Stratified Kfold and GroupFold cross-validation using multiple
+ scoring metrics and permutation feature importances
+
+ Args
+ ----
+ estimator: Scikit learn estimator
+ X: 2D numpy array of training data
+ y: 1D numpy array representing response variable
+ groups: 1D numpy array containing group labels
+ sample_weight: 1D numpy array[n_samples,] of sample weights
+ cv: Integer of cross-validation folds or sklearn.model_selection object
+ sampling: Over- or under-sampling object with fit_sample method
+ scoring: List of performance metrics to use
+ feature_importances: Boolean to perform permutation-based importances
+ n_permutations: Number of permutations during feature importance
+ models: Boolean, return a list of the fitted models
+ random_state: Seed to pass to the random number generator
+ """
+
+ from sklearn import metrics
+ from sklearn.model_selection import (
+ RandomizedSearchCV, GridSearchCV, StratifiedKFold)
+
+ # deepcopy estimator
+ estimator = deepcopy(estimator)
+ fitted_models = []
+
+ # create model_selection method
+ if isinstance(cv, int): cv = StratifiedKFold(n_splits=cv)
+
+ # create dictionary of lists to store metrics
+ if isinstance(scoring, basestring): scoring = [scoring]
+ scores = dict.fromkeys(scoring)
+ scores = {key: [] for key, value in scores.iteritems()}
+ scoring_methods = {'accuracy': metrics.accuracy_score,
+ 'balanced_accuracy': metrics.recall_score,
+ 'average_precision': metrics.average_precision_score,
+ 'brier_loss': metrics.brier_score_loss,
+ 'kappa': metrics.cohen_kappa_score,
+ 'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'hamming_loss': metrics.hamming_loss,
+ 'jaccard_similarity': metrics.jaccard_similarity_score,
+ 'log_loss': metrics.log_loss,
+ 'matthews_corrcoef': metrics.matthews_corrcoef,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score,
+ 'specificity': specificity_score,
+ 'roc_auc': metrics.roc_auc_score,
+ 'zero_one_loss': metrics.zero_one_loss,
+ 'r2': metrics.r2_score,
+ 'neg_mean_squared_error': metrics.mean_squared_error}
+
+ byclass_methods = {'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score}
+
+ # create diction to store byclass metrics results
+ n_classes = len(np.unique(y))
+ labels = np.unique(y)
+ byclass_scores = dict.fromkeys(byclass_methods)
+ byclass_scores = {key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
+
+ # remove any byclass_scorers that are not in the scoring list
+ byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
+
+ # check if remaining scorers are valid sklearn metrics
+ for i in scores.keys():
+ try:
+ list(scoring_methods.keys()).index(i)
+ except:
+ print('Scoring ' + i + ' is not a valid scoring method')
+ print('Valid methods are:')
+ print(scoring_methods.keys())
+
+ # set averaging type for global binary or multiclass scores
+ if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
+ average = 'binary'
+ else:
+ average = 'macro'
+
+ # create np array to store feature importance scores
+ if feature_importances is True:
+ fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
+ fimp[:] = np.nan
+ else:
+ fimp = None
+
+ # generate Kfold indices
+ if groups is None:
+ k_fold = cv.split(X, y)
+ else:
+ k_fold = cv.split(X, y, groups=groups)
+
+ # train on k-1 folds and test of k folds
+ for train_indices, test_indices in k_fold:
+
+ # create training and test folds
+ X_train, X_test = X[train_indices], X[test_indices]
+ y_train, y_test = y[train_indices], y[test_indices]
+ if groups is not None: groups_train = groups[train_indices]
+ else: groups_train = None
+
+ # subset training and test fold sample_weight
+ if sample_weight is not None: weights = sample_weight[train_indices]
+
+ # train estimator
+ if groups is not None and isinstance(estimator, (RandomizedSearchCV, GridSearchCV)) is True:
+ if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
+ else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
+ else:
+ if sample_weight is None: estimator.fit(X_train, y_train)
+ else: estimator.fit(X_train, y_train, sample_weight=weights)
+
+ # optionally store the fitted models on each resample
+ if models is True: fitted_models.append(deepcopy(estimator))
+
+ # prediction of test fold
+ y_pred = estimator.predict(X_test)
+
+ # calculate global performance metrics
+ for m in scores.keys():
+ # metrics that require probabilties
+ if m == 'brier_loss' or m == 'roc_auc':
+ y_prob = estimator.predict_proba(X_test)[:, 1]
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_prob))
+
+ # metrics that have no averaging for multiclass
+ elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
+ or m == 'hamming_loss' or m == 'jaccard_similarity' \
+ or m == 'log_loss' or m == 'zero_one_loss' or m == 'matthews_corrcoef':
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_pred))
+
+ # balanced accuracy
+ elif m == 'balanced_accuracy':
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average='macro'))
+
+ # metrics that have averaging for multiclass
+ else:
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average=average))
+
+ # calculate per-class performance metrics
+ for key in byclass_scores.keys():
+ byclass_scores[key] = np.vstack((
+ byclass_scores[key], byclass_methods[key](
+ y_test, y_pred, labels=labels, average=None)))
+
+ # feature importances using permutation
+ if feature_importances is True:
+ if bool((np.isnan(fimp)).all()) is True:
+ fimp = varimp_permutation(
+ estimator, X_test, y_test, n_permutations,
+ scoring_methods[scoring[0]],
+ random_state)
+ else:
+ fimp = np.row_stack(
+ (fimp, varimp_permutation(
+ estimator, X_test, y_test,
+ n_permutations, scoring_methods[scoring[0]],
+ random_state)))
+
+ return(scores, byclass_scores, fimp, fitted_models)
+
+
+def predict(estimator, predictors, output, predict_type='raw',
+ index=None, rowincr=25):
+ """
+ Prediction on list of GRASS rasters using a fitted scikit learn model
+
+ Args
+ ----
+ estimator: scikit-learn estimator object
+ predictors: list of GRASS rasters
+ output: Name of GRASS raster to output classification results
+ predict_type: character, 'raw' for classification/regression;
+ 'prob' for class probabilities
+ index: Optional, list of class indices to export
+ rowincr: Integer of raster rows to process at one time
+ """
+
+ # convert potential single index to list
+ if isinstance(index, int): index = [index]
+
+ # open predictors as list of rasterrow objects
+ current = Region()
+ n_features = len(predictors)
+ rasstack = [0] * n_features
+
+ for i in range(n_features):
+ rasstack[i] = RasterRow(predictors[i])
+ if rasstack[i].exist() is True:
+ rasstack[i].open('r')
+ else:
+ gscript.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # Prediction using blocks of rows per iteration
+ for rowblock in range(0, current.rows, rowincr):
+ gscript.percent(rowblock, current.rows, rowincr)
+
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
+
+ # loop through each row, and each band and add to 2D img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
+
+ # create mask
+ img_np_row[img_np_row == -2147483648] = np.nan
+ mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+ for feature in range(n_features):
+ invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+ mask[invalid_indexes] = np.nan
+
+ # reshape each row-band matrix into a n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaNs prior to passing to scikit-learn predict
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # perform prediction for classification/regression
+ if predict_type == 'raw':
+ result = estimator.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
+
+ # determine nodata value and grass raster type
+ if result.dtype == 'float':
+ nodata = np.nan
+ ftype = 'FCELL'
+ else:
+ nodata = -2147483648
+ ftype = 'CELL'
+
+ # replace NaN values so that the prediction does not have a border
+ result[np.nonzero(np.isnan(mask))] = nodata
+
+ # on first iteration create the RasterRow object
+ if rowblock == 0:
+ if predict_type == 'raw':
+ classification = RasterRow(output)
+ classification.open('w', ftype, overwrite=True)
+
+ # write the classification result
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
+
+ # perform prediction for class probabilities
+ if predict_type == 'prob':
+ result_proba = estimator.predict_proba(flat_pixels)
+
+ # on first loop determine number of probability classes
+ # and open rasterrow objects for writing
+ if rowblock == 0:
+ if index is None:
+ index = range(result_proba.shape[1])
+ n_classes = len(index)
+ else:
+ n_classes = len(np.unique(index))
+
+ # create and open RasterRow objects for probabilities
+ prob_out_raster = [0] * n_classes
+ prob = [0] * n_classes
+ for iclass, label in enumerate(index):
+ prob_out_raster[iclass] = output + '_classPr' + str(label)
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ for iclass, label in enumerate(index):
+ result_proba_class = result_proba[:, label]
+ result_proba_class = result_proba_class.reshape((rowincr, current.cols))
+ result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
+
+ for row in range(rowincr):
+ newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
+ # close all maps
+ for i in range(n_features): rasstack[i].close()
+ if predict_type == 'raw': classification.close()
+ if predict_type == 'prob':
+ try:
+ for iclass in range(n_classes):
+ prob[iclass].close()
+ except:
+ pass
+
+
+def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
+ """
+ Provides the classifiers and parameters using by the module
+
+ Args
+ ----
+ estimator: Name of estimator
+ random_state: Seed to use in randomized components
+ n_jobs: Integer, number of processing cores to use
+ p: Dict, containing classifier setttings
+ weights: None, or 'balanced' to add class_weights
+
+ Returns
+ -------
+ clf: Scikit-learn classifier object
+ mode: Flag to indicate whether classifier performs classification or
+ regression
+ """
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.ensemble import (
+ RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
+ ExtraTreesRegressor)
+ from sklearn.ensemble import GradientBoostingClassifier
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.svm import SVC
+
+ # convert balanced boolean to scikit learn method
+ if weights is True:
+ weights = 'balanced'
+ else: weights = None
+
+ # optional packages that add additional classifiers here
+ if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
+ try:
+ from sklearn.pipeline import Pipeline
+ from pyearth import Earth
+
+ earth_classifier = Pipeline([('Earth',
+ Earth(max_degree=p['max_degree'])),
+ ('Logistic', LogisticRegression(n_jobs=n_jobs))])
+
+ classifiers = {'EarthClassifier': earth_classifier,
+ 'EarthRegressor': Earth(max_degree=p['max_degree'])}
+ except:
+ gscript.fatal('Py-earth package not installed')
+
+ elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
+ try:
+ from xgboost import XGBClassifier, XGBRegressor
+
+ if p['max_depth'] is None:
+ p['max_depth'] = int(3)
+
+ classifiers = {
+ 'XGBClassifier':
+ XGBClassifier(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ subsample=p['subsample'],
+ nthread=n_jobs),
+ 'XGBRegressor':
+ XGBRegressor(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ subsample=p['subsample'],
+ nthread=n_jobs)}
+ except:
+ gscript.fatal('XGBoost package not installed')
+ else:
+ # core sklearn classifiers go here
+ classifiers = {
+ 'SVC': SVC(C=p['C'],
+ class_weight=weights,
+ probability=True,
+ random_state=random_state),
+ 'LogisticRegression':
+ LogisticRegression(C=p['C'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ fit_intercept=True),
+ 'DecisionTreeClassifier':
+ DecisionTreeClassifier(max_depth=p['max_depth'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state),
+ 'DecisionTreeRegressor':
+ DecisionTreeRegressor(max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state),
+ 'RandomForestClassifier':
+ RandomForestClassifier(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'RandomForestRegressor':
+ RandomForestRegressor(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'ExtraTreesClassifier':
+ ExtraTreesClassifier(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'ExtraTreesRegressor':
+ ExtraTreesRegressor(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'GradientBoostingClassifier':
+ GradientBoostingClassifier(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ subsample=p['subsample'],
+ max_features=p['max_features'],
+ random_state=random_state),
+ 'GradientBoostingRegressor':
+ GradientBoostingRegressor(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ subsample=p['subsample'],
+ max_features=p['max_features'],
+ random_state=random_state),
+ 'GaussianNB': GaussianNB(),
+ 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+ 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+ }
+
+ # define classifier
+ clf = classifiers[estimator]
+
+ # classification or regression
+ if estimator == 'LogisticRegression' \
+ or estimator == 'DecisionTreeClassifier' \
+ or estimator == 'RandomForestClassifier' \
+ or estimator == 'ExtraTreesClassifier' \
+ or estimator == 'GradientBoostingClassifier' \
+ or estimator == 'GaussianNB' \
+ or estimator == 'LinearDiscriminantAnalysis' \
+ or estimator == 'QuadraticDiscriminantAnalysis' \
+ or estimator == 'EarthClassifier' \
+ or estimator == 'XGBClassifier' \
+ or estimator == 'SVC':
+ mode = 'classification'
+ else:
+ mode = 'regression'
+
+ return (clf, mode)
+
+
+def save_training_data(X, y, groups, file):
+ """
+ Saves any extracted training data to a csv file
+
+ Args
+ ----
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels
+ file: Path to a csv file to save data to
+ """
+
+ # if there are no group labels, create a nan filled array
+ if groups is None:
+ groups = np.empty((y.shape[0]))
+ groups[:] = np.nan
+
+ training_data = np.column_stack([X, y, groups])
+ np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+ """
+ Loads training data and labels from a csv file
+
+ Args
+ ----
+ file: Path to a csv file to save data to
+
+ Returns
+ -------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels, or None
+ """
+
+ training_data = np.loadtxt(file, delimiter=',')
+ n_cols = training_data.shape[1]
+ last_Xcol = n_cols-2
+
+ # check to see if last column contains group labels or nans
+ groups = training_data[:, -1]
+
+ # if all nans then set groups to None
+ if bool(np.isnan(groups).all()) is True:
+ groups = None
+
+ # fetch X and y
+ X = training_data[:, 0:last_Xcol]
+ y = training_data[:, -2]
+
+ return(X, y, groups)
+
+
+def extract(response, predictors, lowmem=False):
+ """
+ Samples a list of GRASS rasters using a labelled raster
+ Per raster sampling
+
+ Args
+ ----
+ response: String; GRASS raster with labelled pixels
+ predictors: List of GRASS rasters containing explanatory variables
+ lowmem: Boolean, use numpy memmap to query predictors
+
+ Returns
+ -------
+ training_data: Numpy array of extracted raster values
+ training_labels: Numpy array of labels
+ is_train: Row and Columns of label positions
+ """
+
+ current = Region()
+
+ # open response raster as rasterrow and read as np array
+ if RasterRow(response).exist() is True:
+ roi_gr = RasterRow(response)
+ roi_gr.open('r')
+
+ if lowmem is False:
+ response_np = np.array(roi_gr)
+ else:
+ response_np = np.memmap(
+ tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+ response_np[:] = np.array(roi_gr)[:]
+ else:
+ gscript.fatal("GRASS response raster does not exist.... exiting")
+
+ # determine number of predictor rasters
+ n_features = len(predictors)
+
+ # check to see if all predictors exist
+ for i in range(n_features):
+ if RasterRow(predictors[i]).exist() is not True:
+ gscript.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # check if any of those pixels are labelled (not equal to nodata)
+ # can use even if roi is FCELL because nodata will be nan
+ is_train = np.nonzero(response_np > -2147483648)
+ training_labels = response_np[is_train]
+ n_labels = np.array(is_train).shape[1]
+
+ # Create a zero numpy array of len training labels
+ if lowmem is False:
+ training_data = np.zeros((n_labels, n_features))
+ else:
+ training_data = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(n_labels, n_features))
+
+ # Loop through each raster and sample pixel values at training indexes
+ if lowmem is True:
+ feature_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+
+ for f in range(n_features):
+ predictor_gr = RasterRow(predictors[f])
+ predictor_gr.open('r')
+
+ if lowmem is False:
+ feature_np = np.array(predictor_gr)
+ else:
+ feature_np[:] = np.array(predictor_gr)[:]
+
+ training_data[0:n_labels, f] = feature_np[is_train]
+
+ # close each predictor map
+ predictor_gr.close()
+
+ # convert any CELL maps no datavals to NaN in the training data
+ for i in range(n_features):
+ training_data[training_data[:, i] == -2147483648] = np.nan
+
+ # convert indexes of training pixels from tuple to n*2 np array
+ is_train = np.array(is_train).T
+
+ # close the response map
+ roi_gr.close()
+
+ return(training_data, training_labels, is_train)
+
+
+def maps_from_group(group):
+ """
+ Parse individual rasters into a list from an imagery group
+
+ Args
+ ----
+ group: String; GRASS imagery group
+
+ Returns
+ -------
+ maplist: List containing individual GRASS raster maps
+ map_names: List with print friendly map names
+ """
+ groupmaps = im.group(group=group, flags="g",
+ quiet=True, stdout_=PIPE).outputs.stdout
+
+ maplist = groupmaps.split(os.linesep)
+ maplist = maplist[0:len(maplist)-1]
+ map_names = []
+
+ for rastername in maplist:
+ map_names.append(rastername.split('@')[0])
+
+ return(maplist, map_names)
+
+
+def extract_points(gvector, grasters, field):
+ """
+ Extract values from grass rasters using vector points input
+
+ Args
+ ----
+ gvector: character, name of grass points vector
+ grasters: list of names of grass raster to query
+ field: character, name of field in table to use as response variable
+
+ Returns
+ -------
+ X: 2D numpy array of training data
+ y: 1D numpy array with the response variable
+ coordinates: 2D numpy array of sample coordinates
+ """
+ # open grass vector
+ points = VectorTopo(gvector.split('@')[0])
+ points.open('r')
+
+ # create link to attribute table
+ points.dblinks.by_name(name=gvector)
+
+ # extract table field to numpy array
+ table = points.table
+ cur = table.execute("SELECT {field} FROM {name}".format(field=field, name=table.name))
+ y = np.array([np.isnan if c is None else c[0] for c in cur])
+
+ # extract raster data
+ X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
+ for i, raster in enumerate(grasters):
+ rio = RasterRow(raster)
+ values = np.asarray(get_raster_for_points(points, rio))
+ coordinates = values[:, 1:3]
+ X[:, i] = values[:, 3]
+
+ # set any grass integer nodata values to NaN
+ X[X == -2147483648] = np.nan
+
+ # remove missing response data
+ X = X[~np.isnan(y)]
+ coordinates = coordinates[~np.isnan(y)]
+ y = y[~np.isnan(y)]
+
+ # close
+ points.close()
+
+ return(X, y, coordinates)
More information about the grass-commit
mailing list