[GRASS-SVN] r70911 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
Fri Apr 21 20:32:03 PDT 2017
Author: spawley
Date: 2017-04-21 20:32:03 -0700 (Fri, 21 Apr 2017)
New Revision: 70911
Removed:
grass-addons/grass7/raster/r.learn.ml/raster_learning.py
Modified:
grass-addons/grass7/raster/r.learn.ml/Makefile
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
Log:
r.learn.ml reorganized module structure
Modified: grass-addons/grass7/raster/r.learn.ml/Makefile
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/Makefile 2017-04-22 03:25:48 UTC (rev 70910)
+++ grass-addons/grass7/raster/r.learn.ml/Makefile 2017-04-22 03:32:03 UTC (rev 70911)
@@ -1,8 +1,6 @@
MODULE_TOPDIR = ../..
PGM = r.learn.ml
-
-ETCFILES = raster_learning
include $(MODULE_TOPDIR)/include/Make/Script.make
include $(MODULE_TOPDIR)/include/Make/Python.make
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-22 03:25:48 UTC (rev 70910)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-22 03:32:03 UTC (rev 70911)
@@ -356,23 +356,830 @@
import atexit
import os
+import tempfile
+import itertools
+from copy import deepcopy
import numpy as np
+from numpy.random import RandomState
+
import grass.script as grass
-from copy import deepcopy
from grass.pygrass.modules.shortcuts import raster as r
-from grass.pygrass.utils import set_path
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.modules.shortcuts import imagery as im
+from grass.pygrass.vector import VectorTopo
+from grass.pygrass.vector.table import Link
+from grass.pygrass.utils import get_raster_for_points
+from subprocess import PIPE
-set_path('r.learn.ml')
-from raster_learning import model_classifiers
-from raster_learning import save_training_data, load_training_data
-from raster_learning import maps_from_group, predict, cross_val_scores, extract_points
-
tmp_rast = []
def cleanup():
for rast in tmp_rast:
grass.run_command("g.remove", name=rast, type='raster', flags='f', quiet=True)
+def specificity_score(y_true, y_pred):
+ """
+ Function to calculate specificity score
+
+ Args
+ ----
+ y_true: 1D numpy array of truth values
+ y_pred: 1D numpy array of predicted classes
+
+ Returns
+ -------
+ specificity: specificity score
+ """
+ from sklearn.metrics import confusion_matrix
+
+ cm = confusion_matrix(y_true, y_pred)
+ tn = float(cm[0][0])
+ fp = float(cm[0][1])
+
+ return (tn/(tn+fp))
+
+
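
As a quick, purely illustrative check of the specificity_score helper above (the labels are made up; scikit-learn must be installed):

    import numpy as np

    # specificity = TN / (TN + FP); here TN = 2 and FP = 1
    y_true = np.array([0, 0, 0, 1, 1])
    y_pred = np.array([0, 0, 1, 1, 0])
    print(specificity_score(y_true, y_pred))  # 0.666...
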
+def varimp_permutation(estimator, X_test, y_true,
+ n_permutations, scorer,
+ random_state):
+
+ """
+ Method to perform permutation-based feature importance during
+ cross-validation (cross-validation is applied externally to this
+ method)
+
+ Procedure is:
+ 1. Pass a fitted estimator and a test partition X, y
+ 2. Assess the score on the test partition (best_score)
+ 3. Permute each variable in turn and assess the difference between
+ best_score and the score obtained with the permuted variable
+ 4. Repeat (3) for many random permutations
+ 5. Average the repeats
+
+ Args
+ ----
+ estimator: estimator that has been fitted to a training partition
+ X_test, y_true: data and labels from a test partition
+ n_permutations: number of random permutations to apply
+ scorer: scikit-learn metric function to use
+ random_state: seed to pass to the numpy random.seed
+
+ Returns
+ -------
+ scores: scores for each predictor following permutation
+ """
+
+ # calculate the baseline score on the original (unpermuted) variables
+ y_pred = estimator.predict(X_test)
+ best_score = scorer(y_true, y_pred)
+
+ np.random.seed(seed=random_state)
+ rstate = RandomState(random_state)
+ scores = np.zeros((n_permutations, X_test.shape[1]))
+
+ # outer loop to repeat the permutation n_permutations times
+ for rep in range(n_permutations):
+
+ # inner loop to permute each predictor variable and assess
+ # the resulting decrease in score
+ for i in range(X_test.shape[1]):
+ Xscram = np.copy(X_test)
+ Xscram[:, i] = rstate.choice(X_test[:, i], X_test.shape[0])
+
+ # predict the test partition using the permuted predictor
+ y_pred = estimator.predict(Xscram)
+ scores[rep, i] = best_score-scorer(y_true, y_pred)
+ if scores[rep, i] < 0:
+ scores[rep, i] = 0
+
+ # average the repetitions
+ scores = scores.mean(axis=0)
+
+ return(scores)
+
+
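
A minimal sketch of calling varimp_permutation on its own, outside of cross-validation; the synthetic data, estimator and settings below are illustrative assumptions only (scikit-learn required):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)

    # mean decrease in accuracy per predictor over 10 permutations
    imp = varimp_permutation(clf, X_test, y_test, n_permutations=10,
                             scorer=accuracy_score, random_state=0)
    print(imp)  # one importance score per feature
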
+def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
+ scoring=['accuracy'], feature_importances=False,
+ n_permutations=25, random_state=None):
+
+ """
+ Stratified K-fold and group K-fold cross-validation using multiple
+ scoring metrics and permutation feature importances
+
+ Args
+ ----
+ estimator: Scikit-learn estimator
+ X: 2D numpy array of training data
+ y: 1D numpy array representing response variable
+ groups: 1D numpy array containing group labels
+ sample_weight: 1D numpy array[n_samples,] of sample weights
+ cv: Integer of cross-validation folds or sklearn.model_selection object
+ scoring: List of performance metrics to use
+ feature_importances: Boolean to perform permutation-based importances
+ n_permutations: Number of permutations during feature importance
+ random_state: Seed to pass to the random number generator
+
+ Returns
+ -------
+ scores: Dict of global performance metrics, one value per fold
+ byclass_scores: Dict of per-class performance metrics, one row per fold
+ fimp: 2D numpy array of permutation feature importances, or None
+ """
+
+ from sklearn import metrics
+ from sklearn.model_selection import (
+ RandomizedSearchCV, GridSearchCV, StratifiedKFold)
+
+ estimator = deepcopy(estimator)
+
+ # create model_selection method
+ if isinstance(cv, int):
+ cv = StratifiedKFold(n_splits=cv)
+
+ # check whether the estimator is a parameter search object; the groups
+ # parameter is needed at fit time because the estimator can be a
+ # RandomizedSearchCV or GridSearchCV estimator where cv=GroupKFold
+ if isinstance(estimator, RandomizedSearchCV) is True \
+ or isinstance(estimator, GridSearchCV):
+ param_search = True
+ else:
+ param_search = False
+
+ # create dictionary of lists to store metrics
+ scores = dict.fromkeys(scoring)
+ scores = { key: [] for key, value in scores.iteritems()}
+ scoring_methods = {'accuracy': metrics.accuracy_score,
+ 'balanced_accuracy': metrics.recall_score,
+ 'average_precision': metrics.average_precision_score,
+ 'brier_loss': metrics.brier_score_loss,
+ 'kappa': metrics.cohen_kappa_score,
+ 'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'hamming_loss': metrics.hamming_loss,
+ 'jaccard_similarity': metrics.jaccard_similarity_score,
+ 'log_loss': metrics.log_loss,
+ 'matthews_corrcoef': metrics.matthews_corrcoef,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score,
+ 'specificity': specificity_score,
+ 'roc_auc': metrics.roc_auc_score,
+ 'zero_one_loss': metrics.zero_one_loss,
+ 'r2': metrics.r2_score,
+ 'neg_mean_squared_error': metrics.mean_squared_error}
+
+ byclass_methods = {'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score}
+
+ # create dictionary to store by-class metric results
+ n_classes = len(np.unique(y))
+ labels = np.unique(y)
+ byclass_scores = dict.fromkeys(byclass_methods)
+ byclass_scores = { key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
+
+ # remove any byclass_scorers that are not in the scoring list
+ byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
+
+ # set averaging type for global binary or multiclass scores
+ if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
+ average='binary'
+ else:
+ average='macro'
+
+ # check to see if scoring is a valid sklearn metric
+ for i in scores.keys():
+ try:
+ list(scoring_methods.keys()).index(i)
+ except:
+ print('Scoring ' + i + ' is not a valid scoring method')
+ print('Valid methods are:')
+ print(scoring_methods.keys())
+
+ # create np array to store feature importance scores
+ if feature_importances is True:
+ fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
+ fimp[:] = np.nan
+ else:
+ fimp = None
+
+ # generate Kfold indices
+ if groups is None:
+ k_fold = cv.split(X, y)
+ else:
+ k_fold = cv.split(X, y, groups=groups)
+
+ # train on k-1 folds and test on the remaining fold
+ for train_indices, test_indices in k_fold:
+
+ # subset training and test fold data
+ X_train, X_test = X[train_indices], X[test_indices]
+ y_train, y_test = y[train_indices], y[test_indices]
+
+ # subset training and test fold group ids
+ if groups is not None: groups_train = groups[train_indices]
+ else: groups_train = None
+
+ # subset training and test fold sample_weight
+ if sample_weight is not None: weights = sample_weight[train_indices]
+
+ # train estimator on training fold
+ if groups is not None and param_search is True:
+ if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
+ else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
+ else:
+ if sample_weight is None: estimator.fit(X_train, y_train)
+ else: estimator.fit(X_train, y_train, sample_weight=weights)
+
+ # prediction of test fold
+ y_pred = estimator.predict(X_test)
+
+ # calculate global performance metrics
+ for m in scores.keys():
+ # metrics that require probabilities
+ if m == 'brier_loss' or m == 'roc_auc':
+ y_prob = estimator.predict_proba(X_test)[:, 1]
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_prob))
+
+ # metrics that have no averaging for multiclass
+ elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
+ or m == 'hamming_loss' or m == 'jaccard_similarity' \
+ or m == 'log_loss' or m == 'zero_one_loss' or m == 'matthews_corrcoef':
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_pred))
+
+ # balanced accuracy
+ elif m == 'balanced_accuracy':
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average='macro'))
+
+ # metrics that have averaging for multiclass
+ else:
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average=average))
+
+ # calculate per-class performance metrics
+ for key in byclass_scores.keys():
+ byclass_scores[key] = np.vstack((
+ byclass_scores[key], byclass_methods[key](
+ y_test, y_pred, labels=labels, average=None)))
+
+ # feature importances using permutation
+ if feature_importances is True:
+ if bool((np.isnan(fimp)).all()) is True:
+ fimp = varimp_permutation(
+ estimator, X_test, y_test, n_permutations,
+ scoring_methods[scoring[0]],
+ random_state)
+ else:
+ fimp = np.row_stack(
+ (fimp, varimp_permutation(
+ estimator, X_test, y_test,
+ n_permutations, scoring_methods[scoring[0]],
+ random_state)))
+
+ return(scores, byclass_scores, fimp)
+
+
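
A minimal, self-contained sketch of cross_val_scores on synthetic data; note that, as committed, the function relies on dict.iteritems() and therefore assumes Python 2 (scikit-learn required, data purely illustrative):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=300, n_features=6, random_state=0)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)

    scores, byclass, fimp = cross_val_scores(
        clf, X, y, cv=3, scoring=['accuracy', 'kappa'],
        feature_importances=True, n_permutations=5, random_state=0)

    print({k: v.mean() for k, v in scores.items()})  # mean score per metric
    print(fimp.mean(axis=0))  # mean permutation importance per feature
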
+def predict(estimator, predictors, output, predict_type='raw', labels=None,
+ index=None, rowincr=25):
+
+ """
+ Prediction on list of GRASS rasters using a fitted scikit learn model
+
+ Args
+ ----
+ estimator: scikit-learn estimator object
+ predictors: list of GRASS rasters
+ output: Name of GRASS raster to output classification results
+ predict_type: character, 'raw' for classification/regression;
+ 'prob' for class probabilities
+ labels: Optional, class labels; if provided the output is written as an
+ integer (CELL) raster, otherwise as FCELL
+ index: Optional, list of class indices to export
+ rowincr: Integer of raster rows to process at one time
+ """
+
+ # current region
+ current = Region()
+
+ # determine output data type and nodata
+ if labels is not None:
+ ftype = 'CELL'
+ nodata = -2147483648
+ else:
+ ftype = 'FCELL'
+ nodata = np.nan
+
+ # open predictors as list of rasterrow objects
+ n_features = len(predictors)
+ rasstack = [0] * n_features
+
+ for i in range(n_features):
+ rasstack[i] = RasterRow(predictors[i])
+ if rasstack[i].exist() is True:
+ rasstack[i].open('r')
+ else:
+ grass.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # create and open RasterRow object for writing of classification result
+ if predict_type == 'raw':
+ classification = RasterRow(output)
+ classification.open('w', ftype, overwrite=True)
+
+ # Prediction using row blocks
+ for rowblock in range(0, current.rows, rowincr):
+ grass.percent(rowblock, current.rows, rowincr)
+
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
+
+ # loop through each row, and each band
+ # and add these values to the 2D array img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
+
+ # create mask
+ img_np_row[img_np_row == -2147483648] = np.nan
+ mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+ for feature in range(n_features):
+ invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+ mask[invalid_indexes] = np.nan
+
+ # reshape each row-band matrix into an n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaNs prior to passing to scikit-learn predict
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # perform prediction for classification/regression
+ if predict_type == 'raw':
+ result = estimator.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
+
+ # replace NaN values so that the prediction does not have a border
+ result[np.nonzero(np.isnan(mask))] = nodata
+
+ # for each row we can perform computation, and write the result
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
+
+ # perform prediction for class probabilities
+ if predict_type == 'prob':
+ result_proba = estimator.predict_proba(flat_pixels)
+
+ # on first loop determine number of probability classes
+ # and open rasterrow objects for writing
+ if rowblock == 0:
+ if index is None:
+ index = range(result_proba.shape[1])
+ n_classes = len(index)
+ else:
+ n_classes = len(np.unique(index))
+
+ # create and open RasterRow objects for probabilities
+ prob_out_raster = [0] * n_classes
+ prob = [0] * n_classes
+ for iclass, label in enumerate(index):
+ prob_out_raster[iclass] = output + '_classPr' + str(label)
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ for iclass, label in enumerate(index):
+ result_proba_class = result_proba[:, label]
+ result_proba_class = result_proba_class.reshape((rowincr, current.cols))
+
+ # replace NaN values so that the prediction does not have a border
+ result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
+
+ for row in range(rowincr):
+ newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
+ # close all predictor maps
+ for i in range(n_features): rasstack[i].close()
+
+ # close the output classification or probability maps
+ if predict_type == 'raw':
+ classification.close()
+ if predict_type == 'prob':
+ try:
+ for iclass in range(n_classes):
+ prob[iclass].close()
+ except:
+ pass
+
+
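
An illustrative call to predict(); this only works inside an active GRASS session, the raster and output names below are placeholders, and clf is assumed to be an already fitted estimator:

    # hard classification written to a CELL raster
    predict(estimator=clf, predictors=['elevation', 'slope', 'aspect'],
            output='prediction_map', predict_type='raw', labels=[1, 2, 3])

    # per-class probabilities written to prediction_map_classPr0, _classPr1, ...
    predict(estimator=clf, predictors=['elevation', 'slope', 'aspect'],
            output='prediction_map', predict_type='prob')
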
+def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
+
+ """
+ Provides the classifiers and parameters used by the module
+
+ Args
+ ----
+ estimator: Name of estimator
+ random_state: Seed to use in randomized components
+ n_jobs: Integer, number of processing cores to use
+ p: Dict, containing classifier settings
+ weights: None, or 'balanced' to add class_weights
+
+ Returns
+ -------
+ clf: Scikit-learn classifier object
+ mode: Flag to indicate whether classifier performs classification or
+ regression
+ """
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.ensemble import (
+ RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
+ ExtraTreesRegressor)
+ from sklearn.ensemble import GradientBoostingClassifier
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.svm import SVC
+
+ # optional packages that add additional classifiers here
+ if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
+ try:
+ from sklearn.pipeline import Pipeline
+ from pyearth import Earth
+
+ earth_classifier = Pipeline([('Earth',
+ Earth(max_degree=p['max_degree'])),
+ ('Logistic', LogisticRegression(n_jobs=n_jobs))])
+
+ classifiers = {'EarthClassifier': earth_classifier,
+ 'EarthRegressor': Earth(max_degree=p['max_degree'])}
+ except:
+ grass.fatal('Py-earth package not installed')
+
+ elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
+ try:
+ from xgboost import XGBClassifier, XGBRegressor
+
+ if p['max_depth'] is None:
+ p['max_depth'] = int(3)
+
+ classifiers = {
+ 'XGBClassifier':
+ XGBClassifier(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ subsample=p['subsample'],
+ nthread=n_jobs),
+ 'XGBRegressor':
+ XGBRegressor(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ subsample=p['subsample'],
+ nthread=n_jobs)}
+ except:
+ grass.fatal('XGBoost package not installed')
+ else:
+ # core sklearn classifiers go here
+ classifiers = {
+ 'SVC': SVC(C=p['C'],
+ class_weight=weights,
+ probability=True,
+ random_state=random_state),
+ 'LogisticRegression':
+ LogisticRegression(C=p['C'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ fit_intercept=True),
+ 'DecisionTreeClassifier':
+ DecisionTreeClassifier(max_depth=p['max_depth'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state),
+ 'DecisionTreeRegressor':
+ DecisionTreeRegressor(max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state),
+ 'RandomForestClassifier':
+ RandomForestClassifier(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'RandomForestRegressor':
+ RandomForestRegressor(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'ExtraTreesClassifier':
+ ExtraTreesClassifier(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'ExtraTreesRegressor':
+ ExtraTreesRegressor(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+
+ 'GradientBoostingClassifier':
+ GradientBoostingClassifier(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ subsample=p['subsample'],
+ max_features=p['max_features'],
+ random_state=random_state),
+ 'GradientBoostingRegressor':
+ GradientBoostingRegressor(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ subsample=p['subsample'],
+ max_features=p['max_features'],
+ random_state=random_state),
+ 'GaussianNB': GaussianNB(),
+ 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+ 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+ }
+
+ # define classifier
+ clf = classifiers[estimator]
+
+ # classification or regression
+ if estimator == 'LogisticRegression' \
+ or estimator == 'DecisionTreeClassifier' \
+ or estimator == 'RandomForestClassifier' \
+ or estimator == 'ExtraTreesClassifier' \
+ or estimator == 'GradientBoostingClassifier' \
+ or estimator == 'GaussianNB' \
+ or estimator == 'LinearDiscriminantAnalysis' \
+ or estimator == 'QuadraticDiscriminantAnalysis' \
+ or estimator == 'EarthClassifier' \
+ or estimator == 'XGBClassifier' \
+ or estimator == 'SVC':
+ mode = 'classification'
+ else:
+ mode = 'regression'
+
+ return (clf, mode)
+
+
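
A minimal sketch of obtaining an estimator from model_classifiers. Because the dictionary of core scikit-learn classifiers is built eagerly, p has to contain every hyperparameter key the dictionary references, even for estimators that are not requested; the values below are arbitrary examples:

    p = {'C': 1.0, 'max_depth': None, 'max_features': 'sqrt',
         'min_samples_split': 2, 'min_samples_leaf': 1,
         'n_estimators': 500, 'learning_rate': 0.1, 'subsample': 1.0}

    clf, mode = model_classifiers('RandomForestClassifier', random_state=1,
                                  n_jobs=1, p=p, weights='balanced')
    print(mode)  # 'classification'
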
+def save_training_data(X, y, groups, file):
+
+ """
+ Saves any extracted training data to a csv file
+
+ Args
+ ----
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels
+ file: Path to a csv file to save data to
+ """
+
+ # if there are no group labels, create a nan filled array
+ if groups is None:
+ groups = np.empty((y.shape[0]))
+ groups[:] = np.nan
+
+ training_data = np.column_stack([X, y, groups])
+ np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+
+ """
+ Loads training data and labels from a csv file
+
+ Args
+ ----
+ file: Path to a csv file to load data from
+
+ Returns
+ -------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels, or None
+ """
+
+ training_data = np.loadtxt(file, delimiter=',')
+ n_cols = training_data.shape[1]
+ last_Xcol = n_cols-2
+
+ # check to see if last column contains group labels or nans
+ groups = training_data[:, -1]
+
+ # if all nans then set groups to None
+ if bool(np.isnan(groups).all()) is True:
+ groups = None
+
+ # fetch X and y
+ X = training_data[:, 0:last_Xcol]
+ y = training_data[:, -2]
+
+ return(X, y, groups)
+
+
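
A quick round trip through the two CSV helpers above, using throwaway data and a temporary file:

    import os
    import tempfile
    import numpy as np

    X = np.random.rand(10, 3)
    y = np.random.randint(0, 2, 10)
    csvfile = os.path.join(tempfile.mkdtemp(), 'training.csv')

    save_training_data(X, y, groups=None, file=csvfile)
    X2, y2, groups = load_training_data(csvfile)
    assert groups is None and X2.shape == (10, 3)
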
+def extract(response, predictors, lowmem=False):
+
+ """
+ Samples a list of GRASS rasters using a labelled raster
+ (per-raster sampling)
+
+ Args
+ ----
+ response: String; GRASS raster with labelled pixels
+ predictors: List of GRASS rasters containing explanatory variables
+ lowmem: Boolean, use numpy memmap to query predictors
+
+ Returns
+ -------
+
+ training_data: Numpy array of extracted raster values
+ training_labels: Numpy array of labels
+ is_train: Row and Columns of label positions
+ """
+
+ current = Region()
+
+ # open response raster as rasterrow and read as np array
+ if RasterRow(response).exist() is True:
+ roi_gr = RasterRow(response)
+ roi_gr.open('r')
+
+ if lowmem is False:
+ response_np = np.array(roi_gr)
+ else:
+ response_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+ response_np[:] = np.array(roi_gr)[:]
+ else:
+ grass.fatal("GRASS response raster does not exist.... exiting")
+
+ # determine number of predictor rasters
+ n_features = len(predictors)
+
+ # check to see if all predictors exist
+ for i in range(n_features):
+ if RasterRow(predictors[i]).exist() is not True:
+ grass.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # check if any of those pixels are labelled (not equal to nodata)
+ # this also works if the roi is FCELL because nodata will be nan
+ is_train = np.nonzero(response_np > -2147483648)
+ training_labels = response_np[is_train]
+ n_labels = np.array(is_train).shape[1]
+
+ # Create a zero numpy array of len training labels
+ if lowmem is False:
+ training_data = np.zeros((n_labels, n_features))
+ else:
+ training_data = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(n_labels, n_features))
+
+ # Loop through each raster and sample pixel values at training indexes
+ if lowmem is True:
+ feature_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+
+ for f in range(n_features):
+ predictor_gr = RasterRow(predictors[f])
+ predictor_gr.open('r')
+
+ if lowmem is False:
+ feature_np = np.array(predictor_gr)
+ else:
+ feature_np[:] = np.array(predictor_gr)[:]
+
+ training_data[0:n_labels, f] = feature_np[is_train]
+
+ # close each predictor map
+ predictor_gr.close()
+
+ # convert any CELL map nodata values to NaN in the training data
+ for i in range(n_features):
+ training_data[training_data[:, i] == -2147483648] = np.nan
+
+ # convert indexes of training pixels from tuple to n*2 np array
+ is_train = np.array(is_train).T
+
+ # close the response map
+ roi_gr.close()
+
+ return(training_data, training_labels, is_train)
+
+
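
Within an active GRASS session, the raster-based sampler above would be called along these lines; the map names are placeholders:

    X, y, rowcol = extract(response='landclass_roi',
                           predictors=['elevation', 'slope', 'aspect'],
                           lowmem=False)
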
+def maps_from_group(group):
+
+ """
+ Parse individual rasters into a list from an imagery group
+
+ Args
+ ----
+ group: String; GRASS imagery group
+
+ Returns
+ -------
+ maplist: Python list containing individual GRASS raster maps
+ map_names: Python list of the raster names with any mapset suffix removed
+ """
+ groupmaps = im.group(group=group, flags="g",
+ quiet=True, stdout_=PIPE).outputs.stdout
+
+ maplist = groupmaps.split(os.linesep)
+ maplist = maplist[0:len(maplist)-1]
+ map_names = []
+
+ for rastername in maplist:
+ map_names.append(rastername.split('@')[0])
+
+ return(maplist, map_names)
+
+
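
For example, listing the rasters of a hypothetical imagery group from within a GRASS session:

    maplist, map_names = maps_from_group('landsat_2017')
    # maplist keeps any mapset suffix, map_names strips everything after '@'
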
+def extract_points(gvector, grasters, field):
+ """
+ Extract values from grass rasters using vector points input
+
+ Args
+ ----
+ gvector: character, name of grass points vector
+ grasters: list of names of grass rasters to query
+ field: character, name of field in table to use as response variable
+
+ Returns
+ -------
+ X: 2D numpy array of training data
+ y: 1D numpy array with the response variable
+ coordinates: 2D numpy array of sample coordinates
+ """
+
+ import pandas as pd
+
+ # open grass vector
+ points = VectorTopo(gvector.split('@')[0])
+ points.open('r')
+
+ # create link to attribute table
+ points.dblinks.by_name(name=gvector)
+ link = points.dblinks[0]
+
+ # convert to pandas array
+ gvector_df = pd.DataFrame(points.table_to_dict()).T
+ gvector_df.columns = points.table.columns
+ y = gvector_df.loc[:, field].as_matrix()
+ y = y.astype(float)
+
+ # extract training data
+ X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
+ for i, raster in enumerate(grasters):
+ rio = RasterRow(raster)
+ values = np.asarray(get_raster_for_points(points, rio))
+ coordinates = values[:, 1:3]
+ X[:, i] = values[:, 3]
+
+ # set any grass integer nodata values to NaN
+ X[X == -2147483648] = np.nan
+
+ # remove missing response data
+ X = X[~np.isnan(y)]
+ coordinates = coordinates[~np.isnan(y)]
+ y = y[~np.isnan(y)]
+
+ # close
+ points.close()
+
+ return(X, y, coordinates)
+
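
And the vector-based sampler, again assuming an active GRASS session; the vector, raster and column names are placeholders and pandas must be installed:

    X, y, coords = extract_points(gvector='soil_samples',
                                  grasters=['elevation', 'slope', 'aspect'],
                                  field='ph')
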
def main():
try:
from sklearn.externals import joblib
Deleted: grass-addons/grass7/raster/r.learn.ml/raster_learning.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/raster_learning.py 2017-04-22 03:25:48 UTC (rev 70910)
+++ grass-addons/grass7/raster/r.learn.ml/raster_learning.py 2017-04-22 03:32:03 UTC (rev 70911)
@@ -1,818 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import os
-import numpy as np
-from numpy.random import RandomState
-import tempfile
-import itertools
-from copy import deepcopy
-import grass.script as grass
-from grass.pygrass.raster import RasterRow
-from grass.pygrass.gis.region import Region
-from grass.pygrass.raster.buffer import Buffer
-from grass.pygrass.modules.shortcuts import imagery as im
-from grass.pygrass.vector import VectorTopo
-from grass.pygrass.vector.table import Link
-from grass.pygrass.utils import get_raster_for_points
-from subprocess import PIPE
-
-def specificity_score(y_true, y_pred):
- """
- Function to calculate specificity score
-
- Args
- ----
- y_true: 1D numpy array of truth values
- y_pred: 1D numpy array of predicted classes
-
- Returns
- -------
- specificity: specificity score
- """
- from sklearn.metrics import confusion_matrix
-
- cm = confusion_matrix(y_true, y_pred)
- tn = float(cm[0][0])
- fp = float(cm[0][1])
-
- return (tn/(tn+fp))
-
-
-def varimp_permutation(estimator, X_test, y_true,
- n_permutations, scorer,
- random_state):
-
- """
- Method to perform permutation-based feature importance during
- cross-validation (cross-validation is applied externally to this
- method)
-
- Procedure is:
- 1. Pass fitted estimator and test partition X y
- 2. Assess AUC on the test partition (bestauc)
- 3. Permute each variable and assess the difference between bestauc and
- the messed-up variable
- 4. Repeat (3) for many random permutations
- 5. Average the repeats
-
- Args
- ----
- estimator: estimator that has been fitted to a training partition
- X_test, y_true: data and labels from a test partition
- n_permutations: number of random permutations to apply
- scorer: scikit-learn metric function to use
- random_state: seed to pass to the numpy random.seed
-
- Returns
- -------
- scores: scores for each predictor following permutation
- """
-
- # calculate score on original variables without permutation
- # determine best metric type for binary/multiclass/regression scenarios
- y_pred = estimator.predict(X_test)
- best_score = scorer(y_true, y_pred)
-
- np.random.seed(seed=random_state)
- rstate = RandomState(random_state)
- scores = np.zeros((n_permutations, X_test.shape[1]))
-
- # outer loop to repeat the pemutation rep times
- for rep in range(n_permutations):
-
- # inner loop to permute each predictor variable and assess
- # difference in auc
- for i in range(X_test.shape[1]):
- Xscram = np.copy(X_test)
- Xscram[:, i] = rstate.choice(X_test[:, i], X_test.shape[0])
-
- # fit the model on the training data and predict the test data
- y_pred = estimator.predict(Xscram)
- scores[rep, i] = best_score-scorer(y_true, y_pred)
- if scores[rep, i] < 0:
- scores[rep, i] = 0
-
- # average the repetitions
- scores = scores.mean(axis=0)
-
- return(scores)
-
-
-def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
- scoring=['accuracy'], feature_importances=False,
- n_permutations=25, random_state=None):
-
- """
- Stratified Kfold and GroupFold cross-validation using multiple
- scoring metrics and permutation feature importances
-
- Args
- ----
- estimator: Scikit learn estimator
- X: 2D numpy array of training data
- y: 1D numpy array representing response variable
- groups: 1D numpy array containing group labels
- sample_weight: 1D numpy array[n_samples,] of sample weights
- cv: Integer of cross-validation folds or sklearn.model_selection object
- sampling: Over- or under-sampling object with fit_sample method
- scoring: List of performance metrics to use
- feature_importances: Boolean to perform permutation-based importances
- n_permutations: Number of permutations during feature importance
- random_state: Seed to pass to the random number generator
- """
-
- from sklearn import metrics
- from sklearn.model_selection import (
- RandomizedSearchCV, GridSearchCV, StratifiedKFold)
-
- estimator = deepcopy(estimator)
-
- # create model_selection method
- if isinstance(cv, int):
- cv = StratifiedKFold(n_splits=cv)
-
- # fit the model on the training data and predict the test data
- # need the groups parameter because the estimator can be a
- # RandomizedSearchCV or GridSearchCV estimator where cv=GroupKFold
- if isinstance(estimator, RandomizedSearchCV) is True \
- or isinstance(estimator, GridSearchCV):
- param_search = True
- else:
- param_search = False
-
- # create dictionary of lists to store metrics
- scores = dict.fromkeys(scoring)
- scores = { key: [] for key, value in scores.iteritems()}
- scoring_methods = {'accuracy': metrics.accuracy_score,
- 'balanced_accuracy': metrics.recall_score,
- 'average_precision': metrics.average_precision_score,
- 'brier_loss': metrics.brier_score_loss,
- 'kappa': metrics.cohen_kappa_score,
- 'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'hamming_loss': metrics.hamming_loss,
- 'jaccard_similarity': metrics.jaccard_similarity_score,
- 'log_loss': metrics.log_loss,
- 'matthews_corrcoef': metrics.matthews_corrcoef,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score,
- 'specificity': specificity_score,
- 'roc_auc': metrics.roc_auc_score,
- 'zero_one_loss': metrics.zero_one_loss,
- 'r2': metrics.r2_score,
- 'neg_mean_squared_error': metrics.mean_squared_error}
-
- byclass_methods = {'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score}
-
- # create diction to store byclass metrics results
- n_classes = len(np.unique(y))
- labels = np.unique(y)
- byclass_scores = dict.fromkeys(byclass_methods)
- byclass_scores = { key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
-
- # remove any byclass_scorers that are not in the scoring list
- byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
-
- # set averaging type for global binary or multiclass scores
- if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
- average='binary'
- else:
- average='macro'
-
- # check to see if scoring is a valid sklearn metric
- for i in scores.keys():
- try:
- list(scoring_methods.keys()).index(i)
- except:
- print('Scoring ' + i + ' is not a valid scoring method')
- print('Valid methods are:')
- print(scoring_methods.keys())
-
- # create np array to store feature importance scores
- if feature_importances is True:
- fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
- fimp[:] = np.nan
- else:
- fimp = None
-
- # generate Kfold indices
- if groups is None:
- k_fold = cv.split(X, y)
- else:
- k_fold = cv.split(X, y, groups=groups)
-
- # train on k-1 folds and test of k folds
- for train_indices, test_indices in k_fold:
-
- # subset training and test fold data
- X_train, X_test = X[train_indices], X[test_indices]
- y_train, y_test = y[train_indices], y[test_indices]
-
- # subset training and test fold group ids
- if groups is not None: groups_train = groups[train_indices]
- else: groups_train = None
-
- # subset training and test fold sample_weight
- if sample_weight is not None: weights = sample_weight[train_indices]
-
- # train estimator on training fold
- if groups is not None and param_search is True:
- if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
- else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
- else:
- if sample_weight is None: estimator.fit(X_train, y_train)
- else: estimator.fit(X_train, y_train, sample_weight=weights)
-
- # prediction of test fold
- y_pred = estimator.predict(X_test)
-
- # calculate global performance metrics
- for m in scores.keys():
- # metrics that require probabilties
- if m == 'brier_loss' or m == 'roc_auc':
- y_prob = estimator.predict_proba(X_test)[:, 1]
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_prob))
-
- # metrics that have no averaging for multiclass
- elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
- or m == 'hamming_loss' or m == 'jaccard_similarity' \
- or m == 'log_loss' or m == 'zero_one_loss' or m == 'matthews_corrcoef':
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_pred))
-
- # balanced accuracy
- elif m == 'balanced_accuracy':
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average='macro'))
-
- # metrics that have averaging for multiclass
- else:
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average=average))
-
- # calculate per-class performance metrics
- for key in byclass_scores.keys():
- byclass_scores[key] = np.vstack((
- byclass_scores[key], byclass_methods[key](
- y_test, y_pred, labels=labels, average=None)))
-
- # feature importances using permutation
- if feature_importances is True:
- if bool((np.isnan(fimp)).all()) is True:
- fimp = varimp_permutation(
- estimator, X_test, y_test, n_permutations,
- scoring_methods[scoring[0]],
- random_state)
- else:
- fimp = np.row_stack(
- (fimp, varimp_permutation(
- estimator, X_test, y_test,
- n_permutations, scoring_methods[scoring[0]],
- random_state)))
-
- return(scores, byclass_scores, fimp)
-
-
-def predict(estimator, predictors, output, predict_type='raw', labels=None,
- index=None, rowincr=25):
-
- """
- Prediction on list of GRASS rasters using a fitted scikit learn model
-
- Args
- ----
- estimator: scikit-learn estimator object
- predictors: list of GRASS rasters
- output: Name of GRASS raster to output classification results
- predict_type: character, 'raw' for classification/regression;
- 'prob' for class probabilities
- index: Optional, list of class indices to export
- rowincr: Integer of raster rows to process at one time
- """
-
- # current region
- current = Region()
-
- # determine output data type and nodata
- if labels is not None:
- ftype = 'CELL'
- nodata = -2147483648
- else:
- ftype = 'FCELL'
- nodata = np.nan
-
- # open predictors as list of rasterrow objects
- n_features = len(predictors)
- rasstack = [0] * n_features
-
- for i in range(n_features):
- rasstack[i] = RasterRow(predictors[i])
- if rasstack[i].exist() is True:
- rasstack[i].open('r')
- else:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # create and open RasterRow object for writing of classification result
- if predict_type == 'raw':
- classification = RasterRow(output)
- classification.open('w', ftype, overwrite=True)
-
- # Prediction using row blocks
- for rowblock in range(0, current.rows, rowincr):
- grass.percent(rowblock, current.rows, rowincr)
-
- # check that the row increment does not exceed the number of rows
- if rowblock+rowincr > current.rows:
- rowincr = current.rows - rowblock
- img_np_row = np.zeros((rowincr, current.cols, n_features))
-
- # loop through each row, and each band
- # and add these values to the 2D array img_np_row
- for row in range(rowblock, rowblock+rowincr, 1):
- for band in range(n_features):
- img_np_row[row-rowblock, :, band] = \
- np.array(rasstack[band][row])
-
- # create mask
- img_np_row[img_np_row == -2147483648] = np.nan
- mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
- for feature in range(n_features):
- invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
- mask[invalid_indexes] = np.nan
-
- # reshape each row-band matrix into a n*m array
- nsamples = rowincr * current.cols
- flat_pixels = img_np_row.reshape((nsamples, n_features))
-
- # remove NaNs prior to passing to scikit-learn predict
- flat_pixels = np.nan_to_num(flat_pixels)
-
- # perform prediction for classification/regression
- if predict_type == 'raw':
- result = estimator.predict(flat_pixels)
- result = result.reshape((rowincr, current.cols))
-
- # replace NaN values so that the prediction does not have a border
- result[np.nonzero(np.isnan(mask))] = nodata
-
- # for each row we can perform computation, and write the result
- for row in range(rowincr):
- newrow = Buffer((result.shape[1],), mtype=ftype)
- newrow[:] = result[row, :]
- classification.put_row(newrow)
-
- # perform prediction for class probabilities
- if predict_type == 'prob':
- result_proba = estimator.predict_proba(flat_pixels)
-
- # on first loop determine number of probability classes
- # and open rasterrow objects for writing
- if rowblock == 0:
- if index is None:
- index = range(result_proba.shape[1])
- n_classes = len(index)
- else:
- n_classes = len(np.unique(index))
-
- # create and open RasterRow objects for probabilities
- prob_out_raster = [0] * n_classes
- prob = [0] * n_classes
- for iclass, label in enumerate(index):
- prob_out_raster[iclass] = output + '_classPr' + str(label)
- prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite=True)
-
- for iclass, label in enumerate(index):
- result_proba_class = result_proba[:, label]
- result_proba_class = result_proba_class.reshape((rowincr, current.cols))
-
- # replace NaN values so that the prediction does not have a border
- result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
-
- for row in range(rowincr):
- newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
-
- # close all maps
- for i in range(n_features): rasstack[i].close()
-
- # close all class probability maps
- if predict_type == 'raw':
- classification.close()
- if predict_type == 'prob':
- try:
- for iclass in range(n_classes):
- prob[iclass].close()
- except:
- pass
-
-
-def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
-
- """
- Provides the classifiers and parameters using by the module
-
- Args
- ----
- estimator: Name of estimator
- random_state: Seed to use in randomized components
- n_jobs: Integer, number of processing cores to use
- p: Dict, containing classifier setttings
- weights: None, or 'balanced' to add class_weights
-
- Returns
- -------
- clf: Scikit-learn classifier object
- mode: Flag to indicate whether classifier performs classification or
- regression
- """
-
- from sklearn.linear_model import LogisticRegression
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
- from sklearn.naive_bayes import GaussianNB
- from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
- from sklearn.ensemble import (
- RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
- ExtraTreesRegressor)
- from sklearn.ensemble import GradientBoostingClassifier
- from sklearn.ensemble import GradientBoostingRegressor
- from sklearn.svm import SVC
-
- # optional packages that add additional classifiers here
- if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
- try:
- from sklearn.pipeline import Pipeline
- from pyearth import Earth
-
- earth_classifier = Pipeline([('Earth',
- Earth(max_degree=p['max_degree'])),
- ('Logistic', LogisticRegression(n_jobs=n_jobs))])
-
- classifiers = {'EarthClassifier': earth_classifier,
- 'EarthRegressor': Earth(max_degree=p['max_degree'])}
- except:
- grass.fatal('Py-earth package not installed')
-
- elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
- try:
- from xgboost import XGBClassifier, XGBRegressor
-
- if p['max_depth'] is None:
- p['max_depth'] = int(3)
-
- classifiers = {
- 'XGBClassifier':
- XGBClassifier(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- subsample=p['subsample'],
- nthread=n_jobs),
- 'XGBRegressor':
- XGBRegressor(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- subsample=p['subsample'],
- nthread=n_jobs)}
- except:
- grass.fatal('XGBoost package not installed')
- else:
- # core sklearn classifiers go here
- classifiers = {
- 'SVC': SVC(C=p['C'],
- class_weight=weights,
- probability=True,
- random_state=random_state),
- 'LogisticRegression':
- LogisticRegression(C=p['C'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- fit_intercept=True),
- 'DecisionTreeClassifier':
- DecisionTreeClassifier(max_depth=p['max_depth'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state),
- 'DecisionTreeRegressor':
- DecisionTreeRegressor(max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state),
- 'RandomForestClassifier':
- RandomForestClassifier(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'RandomForestRegressor':
- RandomForestRegressor(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'ExtraTreesClassifier':
- ExtraTreesClassifier(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'ExtraTreesRegressor':
- ExtraTreesRegressor(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
-
- 'GradientBoostingClassifier':
- GradientBoostingClassifier(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- subsample=p['subsample'],
- max_features=p['max_features'],
- random_state=random_state),
- 'GradientBoostingRegressor':
- GradientBoostingRegressor(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- subsample=p['subsample'],
- max_features=p['max_features'],
- random_state=random_state),
- 'GaussianNB': GaussianNB(),
- 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
- 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
- }
-
- # define classifier
- clf = classifiers[estimator]
-
- # classification or regression
- if estimator == 'LogisticRegression' \
- or estimator == 'DecisionTreeClassifier' \
- or estimator == 'RandomForestClassifier' \
- or estimator == 'ExtraTreesClassifier' \
- or estimator == 'GradientBoostingClassifier' \
- or estimator == 'GaussianNB' \
- or estimator == 'LinearDiscriminantAnalysis' \
- or estimator == 'QuadraticDiscriminantAnalysis' \
- or estimator == 'EarthClassifier' \
- or estimator == 'XGBClassifier' \
- or estimator == 'SVC':
- mode = 'classification'
- else:
- mode = 'regression'
-
- return (clf, mode)
-
-
-def save_training_data(X, y, groups, file):
-
- """
- Saves any extracted training data to a csv file
-
- Args
- ----
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels
- file: Path to a csv file to save data to
- """
-
- # if there are no group labels, create a nan filled array
- if groups is None:
- groups = np.empty((y.shape[0]))
- groups[:] = np.nan
-
- training_data = np.column_stack([X, y, groups])
- np.savetxt(file, training_data, delimiter=',')
-
-
-def load_training_data(file):
-
- """
- Loads training data and labels from a csv file
-
- Args
- ----
- file: Path to a csv file to save data to
-
- Returns
- -------
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels, or None
- """
-
- training_data = np.loadtxt(file, delimiter=',')
- n_cols = training_data.shape[1]
- last_Xcol = n_cols-2
-
- # check to see if last column contains group labels or nans
- groups = training_data[:, -1]
-
- # if all nans then set groups to None
- if bool(np.isnan(groups).all()) is True:
- groups = None
-
- # fetch X and y
- X = training_data[:, 0:last_Xcol]
- y = training_data[:, -2]
-
- return(X, y, groups)
-
-
-def extract(response, predictors, lowmem=False):
-
- """
- Samples a list of GRASS rasters using a labelled raster
- Per raster sampling
-
- Args
- ----
- response: String; GRASS raster with labelled pixels
- predictors: List of GRASS rasters containing explanatory variables
- lowmem: Boolean, use numpy memmap to query predictors
-
- Returns
- -------
-
- training_data: Numpy array of extracted raster values
- training_labels: Numpy array of labels
- is_train: Row and Columns of label positions
- """
-
- current = Region()
-
- # open response raster as rasterrow and read as np array
- if RasterRow(response).exist() is True:
- roi_gr = RasterRow(response)
- roi_gr.open('r')
-
- if lowmem is False:
- response_np = np.array(roi_gr)
- else:
- response_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
- response_np[:] = np.array(roi_gr)[:]
- else:
- grass.fatal("GRASS response raster does not exist.... exiting")
-
- # determine number of predictor rasters
- n_features = len(predictors)
-
- # check to see if all predictors exist
- for i in range(n_features):
- if RasterRow(predictors[i]).exist() is not True:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # check if any of those pixels are labelled (not equal to nodata)
- # can use even if roi is FCELL because nodata will be nan
- is_train = np.nonzero(response_np > -2147483648)
- training_labels = response_np[is_train]
- n_labels = np.array(is_train).shape[1]
-
- # Create a zero numpy array of len training labels
- if lowmem is False:
- training_data = np.zeros((n_labels, n_features))
- else:
- training_data = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(n_labels, n_features))
-
- # Loop through each raster and sample pixel values at training indexes
- if lowmem is True:
- feature_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
-
- for f in range(n_features):
- predictor_gr = RasterRow(predictors[f])
- predictor_gr.open('r')
-
- if lowmem is False:
- feature_np = np.array(predictor_gr)
- else:
- feature_np[:] = np.array(predictor_gr)[:]
-
- training_data[0:n_labels, f] = feature_np[is_train]
-
- # close each predictor map
- predictor_gr.close()
-
- # convert any CELL maps no datavals to NaN in the training data
- for i in range(n_features):
- training_data[training_data[:, i] == -2147483648] = np.nan
-
- # convert indexes of training pixels from tuple to n*2 np array
- is_train = np.array(is_train).T
-
- # close the response map
- roi_gr.close()
-
- return(training_data, training_labels, is_train)
-
-
-def maps_from_group(group):
-
- """
- Parse individual rasters into a list from an imagery group
-
- Args
- ----
- group: String; GRASS imagery group
- Returns
- -------
- maplist: Python list containing individual GRASS raster maps
- """
- groupmaps = im.group(group=group, flags="g",
- quiet=True, stdout_=PIPE).outputs.stdout
-
- maplist = groupmaps.split(os.linesep)
- maplist = maplist[0:len(maplist)-1]
- map_names = []
-
- for rastername in maplist:
- map_names.append(rastername.split('@')[0])
-
- return(maplist, map_names)
-
-
-def extract_points(gvector, grasters, field):
- """
- Extract values from grass rasters using vector points input
-
- Args
- ----
- gvector: character, name of grass points vector
- grasters: list of names of grass raster to query
- field: character, name of field in table to use as response variable
-
- Returns
- -------
- X: 2D numpy array of training data
- y: 1D numpy array with the response variable
- coordinates: 2D numpy array of sample coordinates
- """
-
- import pandas as pd
-
- # open grass vector
- points = VectorTopo(gvector.split('@')[0])
- points.open('r')
-
- # create link to attribute table
- points.dblinks.by_name(name=gvector)
- link = points.dblinks[0]
-
- # convert to pandas array
- gvector_df = pd.DataFrame(points.table_to_dict()).T
- gvector_df.columns = points.table.columns
- y = gvector_df.loc[:, field].as_matrix()
- y = y.astype(float)
-
- # extract training data
- X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
- for i, raster in enumerate(grasters):
- rio = RasterRow(raster)
- values = np.asarray(get_raster_for_points(points, rio))
- coordinates = values[:, 1:3]
- X[:, i] = values[:, 3]
-
- # set any grass integer nodata values to NaN
- X[X == -2147483648] = np.nan
-
- # remove missing response data
- X = X[~np.isnan(y)]
- coordinates = coordinates[~np.isnan(y)]
- y = y[~np.isnan(y)]
-
- # close
- points.close()
-
- return(X, y, coordinates)