[GRASS-SVN] r71157 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
Thu Jun 1 15:03:12 PDT 2017
Author: spawley
Date: 2017-06-01 15:03:12 -0700 (Thu, 01 Jun 2017)
New Revision: 71157
Modified:
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py
Log:
r.learn.ml close all raster maps properly
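The change enforces the pygrass pattern already used in these modules: every RasterRow that is opened gets an explicit close() once reading or writing is done. A minimal sketch of that pattern, assuming a running GRASS session and a hypothetical map name:

    from grass.pygrass.raster import RasterRow

    elev = RasterRow('elevation')      # hypothetical raster name
    elev.open('r')
    try:
        first_row = elev[0]            # read one row from the map
    finally:
        elev.close()                   # always release the map, even on error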
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-06-01 22:03:12 UTC (rev 71157)
@@ -448,6 +448,7 @@
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.metrics import make_scorer
+ from sklearn.calibration import CalibratedClassifierCV
except:
gscript.fatal("Scikit learn 0.18 or newer is not installed")
@@ -577,7 +578,7 @@
if mode == 'classification':
scoring = ['accuracy', 'precision', 'recall', 'f1', 'kappa',\
'balanced_accuracy']
- search_scorer = make_scorer(metrics.cohen_kappa_score)
+ search_scorer = make_scorer(metrics.matthews_corrcoef)
else:
scoring = ['r2', 'neg_mean_squared_error']
search_scorer = 'r2'
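For reference, the new search scorer is just the Matthews correlation coefficient wrapped with make_scorer so that GridSearchCV/RandomizedSearchCV can maximize it; a minimal sketch with an illustrative estimator and grid:

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import make_scorer, matthews_corrcoef
    from sklearn.model_selection import GridSearchCV

    search_scorer = make_scorer(matthews_corrcoef)            # higher is better
    search = GridSearchCV(RandomForestClassifier(random_state=1),
                          param_grid={'max_features': [1, 2]},  # illustrative grid
                          scoring=search_scorer, cv=3)
    # search.fit(X, y); search.best_estimator_ is then used as in r.learn.ml.py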
@@ -587,7 +588,7 @@
# -------------------------------------------------------------------------
# fetch individual raster names from group
- maplist, map_names = maps_from_group(group)
+ maplist, _ = maps_from_group(group)
if model_load == '':
@@ -712,7 +713,6 @@
# ---------------------------------------------------------------------
# define the preprocessing pipeline
# ---------------------------------------------------------------------
-
# standardization
if norm_data is True and categorymaps is None:
clf = Pipeline([('scaling', StandardScaler()),
@@ -760,12 +760,11 @@
# ---------------------------------------------------------------------
# classifier training
# ---------------------------------------------------------------------
-
gscript.message(os.linesep)
gscript.message(('Fitting model using ' + classifier))
# pass groups to the fit method when GroupKFold/GroupShuffleSplit is used
- if isinstance(inner, (GroupKFold, GroupShuffleSplit)) and any(param_grid) is True:
+ if isinstance(inner, (GroupKFold, GroupShuffleSplit)):
if balance is True and classifier in (
'GradientBoostingClassifier', 'XGBClassifier'):
clf.fit(X=X, y=y, groups=group_id, sample_weight=class_weights)
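With a GroupKFold inner splitter, groups are forwarded through GridSearchCV.fit so that samples sharing a group id never straddle a train/test split; a minimal standalone sketch with illustrative data:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV, GroupKFold

    X = np.random.rand(20, 3)
    y = np.tile([0, 1], 10)                        # two classes, alternating
    group_id = np.repeat(np.arange(5), 4)          # five groups of four samples

    clf = GridSearchCV(RandomForestClassifier(n_estimators=10),
                       param_grid={'max_depth': [2, 4]},
                       cv=GroupKFold(n_splits=3))
    clf.fit(X=X, y=y, groups=group_id)             # groups reach the splitter, not the estimator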
@@ -792,7 +791,6 @@
# ---------------------------------------------------------------------
# cross-validation
# ---------------------------------------------------------------------
-
# If cv > 1 then use cross-validation to generate performance measures
if cv > 1 and tune_only is not True:
if mode == 'classification' and cv > np.histogram(
@@ -885,6 +883,13 @@
if model_only is not True:
gscript.message(os.linesep)
+ # recalibrate probabilities if classes have been balanced
+ if balance is True:
+ if any(param_grid) is True:
+ clf = clf.best_estimator_
+ clf = CalibratedClassifierCV(clf, cv=outer)
+ clf.fit(X, y)
+
# predict classification/regression raster
if prob_only is False:
gscript.message('Predicting classification/regression raster...')
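The recalibration step wraps the (possibly tuned) estimator in CalibratedClassifierCV so that probabilities predicted after class balancing are rescaled on cross-validated folds; a minimal sketch with illustrative data:

    import numpy as np
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.ensemble import RandomForestClassifier

    X = np.random.rand(60, 3)
    y = np.tile([0, 1, 2], 20)                         # three balanced classes

    base = RandomForestClassifier(n_estimators=10)
    calibrated = CalibratedClassifierCV(base, cv=3)    # clones and refits base per fold
    calibrated.fit(X, y)
    proba = calibrated.predict_proba(X)                # calibrated class probabilities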
Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py 2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py 2017-06-01 22:03:12 UTC (rev 71157)
@@ -1,350 +1,350 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-The module rlearn_crossval contains functions to perform
-model validation and permutation feature importances.
-
-"""
-
-from __future__ import absolute_import
-from copy import deepcopy
-
-import numpy as np
-import os
-from numpy.random import RandomState
-
-import grass.script as gscript
-
-def specificity_score(y_true, y_pred):
- """
- Calculate specificity score
-
- Args
- ----
- y_true: 1D numpy array of truth values
- y_pred: 1D numpy array of predicted classes
-
- Returns
- -------
- specificity: specificity score
- """
- from sklearn.metrics import confusion_matrix
-
- cm = confusion_matrix(y_true, y_pred)
- tn = float(cm[0][0])
- fp = float(cm[0][1])
-
- return tn/(tn+fp)
-
-
-def varimp_permutation(estimator, X, y, n_permutations, scorer,
- n_jobs, random_state):
- """
- Method to perform permutation-based feature importance during
- cross-validation (cross-validation is applied externally to this
- method)
-
- Procedure is:
- 1. Pass fitted estimator and test partition X y
- 2. Assess AUC on the test partition (bestauc)
- 3. Permute each variable and assess the difference between bestauc and
- the messed-up variable
- 4. Repeat (3) for many random permutations
- 5. Average the repeats
-
- Args
- ----
- estimator: estimator that has been fitted to a training partition
- X, y: data and labels from a test partition
- n_permutations: number of random permutations to apply
- scorer: scikit-learn metric function to use
- n_jobs: integer, number of processing cores
- random_state: seed to pass to the numpy random.seed
-
- Returns
- -------
- scores: scores for each predictor following permutation
- """
-
- from sklearn.externals.joblib import Parallel, delayed
-
- # calculate score on original variables without permutation
- # determine best metric type for binary/multiclass/regression scenarios
- y_pred = estimator.predict(X)
- best_score = scorer(y, y_pred)
-
- # repeated permutations and return difference from best score per predictor
- scores = Parallel(n_jobs=n_jobs)(
- delayed(__permute)(
- estimator, X, y, best_score, scorer, random_state)
- for n in range(n_permutations))
-
- # average the repetitions
- scores = np.asarray(scores)
- scores = scores.mean(axis=0)
-
- return scores
-
-
-def __permute(estimator, X, y, best_score, scorer, random_state):
- """
- Permute each predictor and measure difference from best score
-
- Args
- ----
- estimator: scikit learn estimator
- X, y: data and labels from a test partition
- best_score: best scorer obtained on unperturbed data
- scorer: scoring method to use to measure importances
- random_state: random seed
-
- Returns
- -------
- scores: 2D numpy array of scores for each predictor following permutation
- """
-
- rstate = RandomState(random_state)
-
- # permute each predictor variable and assess difference in score
- scores = np.zeros(X.shape[1])
-
- for i in range(X.shape[1]):
- Xscram = np.copy(X)
- Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])
-
- # fit the model on the training data and predict the test data
- y_pred = estimator.predict(Xscram)
- scores[i] = best_score-scorer(y, y_pred)
- if scores[i] < 0:
- scores[i] = 0
-
- return scores
-
-
-def __parallel_fit(estimator, X, y, groups, train_indices, test_indices, sample_weight):
- """
- Fit classifiers/regressors in parallel
-
- Args
- ----
- estimator: scikit learn estimator
- X, y: 2D and 1D numpy arrays of training data and labels
- groups: 1D numpy array of len(y) containing group labels
- train_indices, test_indices: 1D numpy arrays of indices to use for training/validation
- sample_weight: 1D numpy array of len(y) containing weights to use during fitting
- applied only to XGBoost and Gradient Boosting classifiers
- """
-
- # create training and test folds
- X_train, X_test = X[train_indices], X[test_indices]
- y_train, y_test = y[train_indices], y[test_indices]
- if groups is not None: groups_train = groups[train_indices]
- else: groups_train = None
-
- # subset training and test fold sample_weight
- if sample_weight is not None: weights = sample_weight[train_indices]
-
- # train estimator
- if groups is not None and type(estimator).__name__ in ['RandomizedSearchCV', 'GridSearchCV']:
- if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
- else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
- else:
- if sample_weight is None: estimator.fit(X_train, y_train)
- else: estimator.fit(X_train, y_train, sample_weight=weights)
-
- return estimator
-
-
-def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
- scoring='accuracy', feature_importances=False,
- n_permutations=25, random_state=None, n_jobs=-1):
- """
- Stratified KFold and GroupKFold cross-validation using multiple
- scoring metrics and permutation feature importances
-
- Args
- ----
- estimator: Scikit learn estimator
- X, y: 2D and 1D numpy array of training data and labels
- groups: 1D numpy array containing group labels
- sample_weight: 1D numpy array[n_samples,] of sample weights
- cv: Integer of cross-validation folds or sklearn.model_selection object
- scoring: List of performance metrics to use
- feature_importances: Boolean to perform permutation-based importances
- n_permutations: Number of permutations during feature importance
- random_state: Seed to pass to the random number generator
-
- Returns
- -------
- scores: Dict, containing lists of scores per cross-validation fold
- byclass_scores: Dict, containing scores per class
- fimp: 2D numpy array of permutation feature importances per feature
- clf_resamples: List, fitted estimators
- predictions: 2D numpy array with y_true, y_pred, fold
- """
-
- from sklearn import metrics
- from sklearn.model_selection import StratifiedKFold
- from sklearn.externals.joblib import Parallel, delayed
-
- # -------------------------------------------------------------------------
- # create copies of estimator and create cross-validation iterator
- # -------------------------------------------------------------------------
-
- # deepcopy estimator
- clf = deepcopy(estimator)
-
- # create model_selection method
- if isinstance(cv, int):
- cv = StratifiedKFold(n_splits=cv)
-
- # -------------------------------------------------------------------------
- # create dictionary of lists to store metrics
- # -------------------------------------------------------------------------
-
- if isinstance(scoring, basestring):
- scoring = [scoring]
- scores = dict.fromkeys(scoring)
- scores = {key: [] for key, value in scores.iteritems()}
- scoring_methods = {'accuracy': metrics.accuracy_score,
- 'balanced_accuracy': metrics.recall_score,
- 'average_precision': metrics.average_precision_score,
- 'brier_loss': metrics.brier_score_loss,
- 'kappa': metrics.cohen_kappa_score,
- 'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'hamming_loss': metrics.hamming_loss,
- 'jaccard_similarity': metrics.jaccard_similarity_score,
- 'log_loss': metrics.log_loss,
- 'matthews_corrcoef': metrics.matthews_corrcoef,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score,
- 'specificity': specificity_score,
- 'roc_auc': metrics.roc_auc_score,
- 'zero_one_loss': metrics.zero_one_loss,
- 'r2': metrics.r2_score,
- 'neg_mean_squared_error': metrics.mean_squared_error}
-
- byclass_methods = {'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score}
-
- # create dict to store byclass metrics results
- n_classes = len(np.unique(y))
- labels = np.unique(y)
- byclass_scores = dict.fromkeys(byclass_methods)
- byclass_scores = {key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
-
- # remove any byclass_scorers that are not in the scoring list
- byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
-
- # check if remaining scorers are valid sklearn metrics
- for i in scores.keys():
- try:
- list(scoring_methods.keys()).index(i)
- except:
- gscript.fatal(('Scoring ', i, ' is not a valid scoring method',
- os.linesep(),
- 'Valid methods are: ', scoring_methods.keys()))
-
- # set averaging type for global binary or multiclass scores
- if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
- average = 'binary'
- else:
- average = 'macro'
-
- # create np array to store feature importance scores
- if feature_importances is True:
- fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
- fimp[:] = np.nan
- else:
- fimp = None
-
- # -------------------------------------------------------------------------
- # extract cross-validation indices
- # -------------------------------------------------------------------------
-
- if groups is None:
- k_fold = cv.split(X, y)
- else:
- k_fold = cv.split(X, y, groups=groups)
-
- trains, tests = [], []
- for train_indices, test_indices in k_fold:
- trains.append(train_indices)
- tests.append(test_indices)
-
- # -------------------------------------------------------------------------
- # Perform multiprocessing fitting of clf on each fold
- # -------------------------------------------------------------------------
-
- clf_resamples = Parallel(n_jobs=n_jobs)(
- delayed(__parallel_fit)(clf, X, y, groups, train_indices,
- test_indices, sample_weight)
- for train_indices, test_indices in zip(trains, tests))
-
- # -------------------------------------------------------------------------
- # loop through each fold and calculate performance metrics
- # -------------------------------------------------------------------------
-
- # store predictions and indices
- predictions = np.zeros((len(y), 3)) # y_true, y_pred, fold
-
- fold = 0
- for train_indices, test_indices in zip(trains, tests):
-
- # create training and test folds
- X_train, X_test = X[train_indices], X[test_indices]
- y_train, y_test = y[train_indices], y[test_indices]
-
- # prediction of test fold
- y_pred = clf_resamples[fold].predict(X_test)
- predictions[test_indices, 0] = y_test
- predictions[test_indices, 1] = y_pred
- predictions[test_indices, 2] = fold
-
- # calculate global performance metrics
- for m in scores.keys():
- # metrics that require probabilities
- if m == 'brier_loss' or m == 'roc_auc':
- y_prob = clf_resamples[fold].predict_proba(X_test)[:, 1]
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_prob))
-
- # metrics that have no averaging for multiclass
- elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
- or m == 'hamming_loss' or m == 'jaccard_similarity' \
- or m == 'log_loss' or m == 'zero_one_loss' \
- or m == 'matthews_corrcoef' or m == 'r2' \
- or m == 'neg_mean_squared_error':
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_pred))
-
- # balanced accuracy
- elif m == 'balanced_accuracy':
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average='macro'))
-
- # metrics that have averaging for multiclass
- else:
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average=average))
-
- # calculate per-class performance metrics
- for key in byclass_scores.keys():
- byclass_scores[key] = np.vstack((
- byclass_scores[key], byclass_methods[key](
- y_test, y_pred, labels=labels, average=None)))
-
- # feature importances using permutation
- if feature_importances is True:
- fimp[fold, :] = varimp_permutation(
- clf_resamples[fold], X_test, y_test, n_permutations,
- scoring_methods[scoring[0]], n_jobs, random_state)
- fold += 1
-
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The module rlearn_crossval contains functions to perform
+model validation and permutation feature importances.
+
+"""
+
+from __future__ import absolute_import
+from copy import deepcopy
+
+import numpy as np
+import os
+from numpy.random import RandomState
+
+import grass.script as gscript
+
+def specificity_score(y_true, y_pred):
+ """
+ Calculate specificity score
+
+ Args
+ ----
+ y_true: 1D numpy array of truth values
+ y_pred: 1D numpy array of predicted classes
+
+ Returns
+ -------
+ specificity: specificity score
+ """
+ from sklearn.metrics import confusion_matrix
+
+ cm = confusion_matrix(y_true, y_pred)
+ tn = float(cm[0][0])
+ fp = float(cm[0][1])
+
+ return tn/(tn+fp)
+
+
+def varimp_permutation(estimator, X, y, n_permutations, scorer,
+ n_jobs, random_state):
+ """
+ Method to perform permutation-based feature importance during
+ cross-validation (cross-validation is applied externally to this
+ method)
+
+ Procedure is:
+ 1. Pass fitted estimator and test partition X y
+ 2. Assess AUC on the test partition (bestauc)
+ 3. Permute each variable and assess the difference between bestauc and
+ the messed-up variable
+ 4. Repeat (3) for many random permutations
+ 5. Average the repeats
+
+ Args
+ ----
+ estimator: estimator that has been fitted to a training partition
+ X, y: data and labels from a test partition
+ n_permutations: number of random permutations to apply
+ scorer: scikit-learn metric function to use
+ n_jobs: integer, number of processing cores
+ random_state: seed to pass to the numpy random.seed
+
+ Returns
+ -------
+ scores: scores for each predictor following permutation
+ """
+
+ from sklearn.externals.joblib import Parallel, delayed
+
+ # calculate score on original variables without permutation
+ # determine best metric type for binary/multiclass/regression scenarios
+ y_pred = estimator.predict(X)
+ best_score = scorer(y, y_pred)
+
+ # repeated permutations and return difference from best score per predictor
+ scores = Parallel(n_jobs=n_jobs)(
+ delayed(__permute)(
+ estimator, X, y, best_score, scorer, random_state)
+ for n in range(n_permutations))
+
+ # average the repetitions
+ scores = np.asarray(scores)
+ scores = scores.mean(axis=0)
+
+ return scores
+
+
+def __permute(estimator, X, y, best_score, scorer, random_state):
+ """
+ Permute each predictor and measure difference from best score
+
+ Args
+ ----
+ estimator: scikit learn estimator
+ X, y: data and labels from a test partition
+ best_score: best scorer obtained on unperturbed data
+ scorer: scoring method to use to measure importances
+ random_state: random seed
+
+ Returns
+ -------
+ scores: 2D numpy array of scores for each predictor following permutation
+ """
+
+ rstate = RandomState(random_state)
+
+ # permute each predictor variable and assess difference in score
+ scores = np.zeros(X.shape[1])
+
+ for i in range(X.shape[1]):
+ Xscram = np.copy(X)
+ Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])
+
+ # fit the model on the training data and predict the test data
+ y_pred = estimator.predict(Xscram)
+ scores[i] = best_score-scorer(y, y_pred)
+ if scores[i] < 0:
+ scores[i] = 0
+
+ return scores
+
+
+def __parallel_fit(estimator, X, y, groups, train_indices, test_indices, sample_weight):
+ """
+ Fit classifiers/regressors in parallel
+
+ Args
+ ----
+ estimator: scikit learn estimator
+ X, y: 2D and 1D numpy arrays of training data and labels
+ groups: 1D numpy array of len(y) containing group labels
+ train_indices, test_indices: 1D numpy arrays of indices to use for training/validation
+ sample_weight: 1D numpy array of len(y) containing weights to use during fitting
+ applied only to XGBoost and Gradient Boosting classifiers
+ """
+
+ # create training and test folds
+ X_train, X_test = X[train_indices], X[test_indices]
+ y_train, y_test = y[train_indices], y[test_indices]
+ if groups is not None: groups_train = groups[train_indices]
+ else: groups_train = None
+
+ # subset training and test fold sample_weight
+ if sample_weight is not None: weights = sample_weight[train_indices]
+
+ # train estimator
+ if groups is not None and type(estimator).__name__ in ['RandomizedSearchCV', 'GridSearchCV']:
+ if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
+ else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
+ else:
+ if sample_weight is None: estimator.fit(X_train, y_train)
+ else: estimator.fit(X_train, y_train, sample_weight=weights)
+
+ return estimator
+
+
+def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
+ scoring='accuracy', feature_importances=False,
+ n_permutations=25, random_state=None, n_jobs=-1):
+ """
+ Stratified KFold and GroupKFold cross-validation using multiple
+ scoring metrics and permutation feature importances
+
+ Args
+ ----
+ estimator: Scikit learn estimator
+ X, y: 2D and 1D numpy array of training data and labels
+ groups: 1D numpy array containing group labels
+ sample_weight: 1D numpy array[n_samples,] of sample weights
+ cv: Integer of cross-validation folds or sklearn.model_selection object
+ scoring: List of performance metrics to use
+ feature_importances: Boolean to perform permutation-based importances
+ n_permutations: Number of permutations during feature importance
+ random_state: Seed to pass to the random number generator
+
+ Returns
+ -------
+ scores: Dict, containing lists of scores per cross-validation fold
+ byclass_scores: Dict, containing scores per class
+ fimp: 2D numpy array of permutation feature importances per feature
+ clf_resamples: List, fitted estimators
+ predictions: 2D numpy array with y_true, y_pred, fold
+ """
+
+ from sklearn import metrics
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.externals.joblib import Parallel, delayed
+
+ # -------------------------------------------------------------------------
+ # create copies of estimator and create cross-validation iterator
+ # -------------------------------------------------------------------------
+
+ # deepcopy estimator
+ clf = deepcopy(estimator)
+
+ # create model_selection method
+ if isinstance(cv, int):
+ cv = StratifiedKFold(n_splits=cv)
+
+ # -------------------------------------------------------------------------
+ # create dictionary of lists to store metrics
+ # -------------------------------------------------------------------------
+
+ if isinstance(scoring, basestring):
+ scoring = [scoring]
+ scores = dict.fromkeys(scoring)
+ scores = {key: [] for key, value in scores.iteritems()}
+ scoring_methods = {'accuracy': metrics.accuracy_score,
+ 'balanced_accuracy': metrics.recall_score,
+ 'average_precision': metrics.average_precision_score,
+ 'brier_loss': metrics.brier_score_loss,
+ 'kappa': metrics.cohen_kappa_score,
+ 'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'hamming_loss': metrics.hamming_loss,
+ 'jaccard_similarity': metrics.jaccard_similarity_score,
+ 'log_loss': metrics.log_loss,
+ 'matthews_corrcoef': metrics.matthews_corrcoef,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score,
+ 'specificity': specificity_score,
+ 'roc_auc': metrics.roc_auc_score,
+ 'zero_one_loss': metrics.zero_one_loss,
+ 'r2': metrics.r2_score,
+ 'neg_mean_squared_error': metrics.mean_squared_error}
+
+ byclass_methods = {'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score}
+
+ # create dict to store byclass metrics results
+ n_classes = len(np.unique(y))
+ labels = np.unique(y)
+ byclass_scores = dict.fromkeys(byclass_methods)
+ byclass_scores = {key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
+
+ # remove any byclass_scorers that are not in the scoring list
+ byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
+
+ # check if remaining scorers are valid sklearn metrics
+ for i in scores.keys():
+ try:
+ list(scoring_methods.keys()).index(i)
+ except:
+ gscript.fatal(('Scoring ', i, ' is not a valid scoring method',
+ os.linesep(),
+ 'Valid methods are: ', scoring_methods.keys()))
+
+ # set averaging type for global binary or multiclass scores
+ if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
+ average = 'binary'
+ else:
+ average = 'macro'
+
+ # create np array to store feature importance scores
+ if feature_importances is True:
+ fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
+ fimp[:] = np.nan
+ else:
+ fimp = None
+
+ # -------------------------------------------------------------------------
+ # extract cross-validation indices
+ # -------------------------------------------------------------------------
+
+ if groups is None:
+ k_fold = cv.split(X, y)
+ else:
+ k_fold = cv.split(X, y, groups=groups)
+
+ trains, tests = [], []
+ for train_indices, test_indices in k_fold:
+ trains.append(train_indices)
+ tests.append(test_indices)
+
+ # -------------------------------------------------------------------------
+ # Perform multiprocessing fitting of clf on each fold
+ # -------------------------------------------------------------------------
+
+ clf_resamples = Parallel(n_jobs=n_jobs)(
+ delayed(__parallel_fit)(clf, X, y, groups, train_indices,
+ test_indices, sample_weight)
+ for train_indices, test_indices in zip(trains, tests))
+
+ # -------------------------------------------------------------------------
+ # loop through each fold and calculate performance metrics
+ # -------------------------------------------------------------------------
+
+ # store predictions and indices
+ predictions = np.zeros((len(y), 3)) # y_true, y_pred, fold
+
+ fold = 0
+ for train_indices, test_indices in zip(trains, tests):
+
+ # create training and test folds
+ X_train, X_test = X[train_indices], X[test_indices]
+ y_train, y_test = y[train_indices], y[test_indices]
+
+ # prediction of test fold
+ y_pred = clf_resamples[fold].predict(X_test)
+ predictions[test_indices, 0] = y_test
+ predictions[test_indices, 1] = y_pred
+ predictions[test_indices, 2] = fold
+
+ # calculate global performance metrics
+ for m in scores.keys():
+ # metrics that require probabilities
+ if m == 'brier_loss' or m == 'roc_auc':
+ y_prob = clf_resamples[fold].predict_proba(X_test)[:, 1]
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_prob))
+
+ # metrics that have no averaging for multiclass
+ elif m == 'kappa' or m == 'specificity' or m == 'accuracy' \
+ or m == 'hamming_loss' or m == 'jaccard_similarity' \
+ or m == 'log_loss' or m == 'zero_one_loss' \
+ or m == 'matthews_corrcoef' or m == 'r2' \
+ or m == 'neg_mean_squared_error':
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_pred))
+
+ # balanced accuracy
+ elif m == 'balanced_accuracy':
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average='macro'))
+
+ # metrics that have averaging for multiclass
+ else:
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average=average))
+
+ # calculate per-class performance metrics
+ for key in byclass_scores.keys():
+ byclass_scores[key] = np.vstack((
+ byclass_scores[key], byclass_methods[key](
+ y_test, y_pred, labels=labels, average=None)))
+
+ # feature importances using permutation
+ if feature_importances is True:
+ fimp[fold, :] = varimp_permutation(
+ clf_resamples[fold], X_test, y_test, n_permutations,
+ scoring_methods[scoring[0]], n_jobs, random_state)
+ fold += 1
+
return(scores, byclass_scores, fimp, clf_resamples, predictions)
\ No newline at end of file
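The permutation importance procedure documented in varimp_permutation (score the test data, permute one predictor at a time, record the drop in score, clip negative drops to zero) can be reproduced standalone; a sketch with illustrative data, using accuracy as the scorer:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    rng = np.random.RandomState(0)
    X = rng.rand(200, 4)
    y = (X[:, 0] > 0.5).astype(int)                     # only feature 0 is informative

    clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
    best_score = accuracy_score(y, clf.predict(X))

    importances = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        Xscram = X.copy()
        Xscram[:, i] = rng.permutation(Xscram[:, i])    # permute a single predictor
        drop = best_score - accuracy_score(y, clf.predict(Xscram))
        importances[i] = max(0.0, drop)                 # negative drops clipped to zero

    print(importances)                                  # feature 0 shows the largest drop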
Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py 2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py 2017-06-01 22:03:12 UTC (rev 71157)
@@ -1,479 +1,486 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-The module rlearn_rasters contains functions to
-extract training data from GRASS rasters.
-
-"""
-
-
-from __future__ import absolute_import
-
-import numpy as np
-import tempfile
-
-import grass.script as gscript
-from grass.pygrass.raster import RasterRow
-from grass.pygrass.gis.region import Region
-from grass.pygrass.raster.buffer import Buffer
-from grass.pygrass.vector import VectorTopo
-from grass.pygrass.utils import get_raster_for_points, pixel2coor
-
-
-def extract(response, predictors, lowmem=False):
- """
- Samples a list of GRASS rasters using a labelled raster
- Per raster sampling
-
- Args
- ----
- response: String; GRASS raster with labelled pixels
- predictors: List of GRASS rasters containing explanatory variables
- lowmem: Boolean, use numpy memmap to query predictors
-
- Returns
- -------
- training_data: Numpy array of extracted raster values
- training_labels: Numpy array of labels
- is_train: Row and Columns of label positions
- """
-
- current = Region()
-
- # open response raster as rasterrow and read as np array
- if RasterRow(response).exist() is True:
- roi_gr = RasterRow(response)
- roi_gr.open('r')
-
- if lowmem is False:
- response_np = np.array(roi_gr)
- else:
- response_np = np.memmap(
- tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
- response_np[:] = np.array(roi_gr)[:]
- else:
- gscript.fatal("GRASS response raster does not exist.... exiting")
-
- # determine number of predictor rasters
- n_features = len(predictors)
-
- # check to see if all predictors exist
- for i in range(n_features):
- if RasterRow(predictors[i]).exist() is not True:
- gscript.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # check if any of those pixels are labelled (not equal to nodata)
- # can use even if roi is FCELL because nodata will be nan
- is_train = np.nonzero(response_np > -2147483648)
- training_labels = response_np[is_train]
- n_labels = np.array(is_train).shape[1]
-
- # Create a zero numpy array of len training labels
- if lowmem is False:
- training_data = np.zeros((n_labels, n_features))
- else:
- training_data = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(n_labels, n_features))
-
- # Loop through each raster and sample pixel values at training indexes
- if lowmem is True:
- feature_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
-
- for f in range(n_features):
- predictor_gr = RasterRow(predictors[f])
- predictor_gr.open('r')
-
- if lowmem is False:
- feature_np = np.array(predictor_gr)
- else:
- feature_np[:] = np.array(predictor_gr)[:]
-
- training_data[0:n_labels, f] = feature_np[is_train]
-
- # close each predictor map
- predictor_gr.close()
-
- # convert any CELL map nodata values to NaN in the training data
- for i in range(n_features):
- training_data[training_data[:, i] == -2147483648] = np.nan
-
- # convert indexes of training pixels from tuple to n*2 np array
- is_train = np.array(is_train).T
- for i in range(is_train.shape[0]):
- is_train[i, :] = np.array(pixel2coor(tuple(is_train[i]), current))
-
- # close the response map
- roi_gr.close()
-
- return(training_data, training_labels, is_train)
-
-
-def extract_points(gvector, grasters, field):
- """
- Extract values from grass rasters using vector points input
-
- Args
- ----
- gvector: character, name of grass points vector
- grasters: list of names of grass raster to query
- field: character, name of field in table to use as response variable
-
- Returns
- -------
- X: 2D numpy array of training data
- y: 1D numpy array with the response variable
- coordinates: 2D numpy array of sample coordinates
- """
- # open grass vector
- points = VectorTopo(gvector.split('@')[0])
- points.open('r')
-
- # create link to attribute table
- points.dblinks.by_name(name=gvector)
-
- # extract table field to numpy array
- table = points.table
- cur = table.execute("SELECT {field} FROM {name}".format(field=field, name=table.name))
- y = np.array([np.isnan if c is None else c[0] for c in cur])
-
- # extract raster data
- X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
- for i, raster in enumerate(grasters):
- rio = RasterRow(raster)
- values = np.asarray(get_raster_for_points(points, rio))
- coordinates = values[:, 1:3]
- X[:, i] = values[:, 3]
-
- # set any grass integer nodata values to NaN
- X[X == -2147483648] = np.nan
-
- # remove missing response data
- X = X[~np.isnan(y)]
- coordinates = coordinates[~np.isnan(y)]
- y = y[~np.isnan(y)]
-
- # close
- points.close()
-
- return(X, y, coordinates)
-
-
-def predict(estimator, predictors, output, predict_type='raw',
- index=None, rowincr=25, n_jobs=-2):
- """
- Prediction on list of GRASS rasters using a fitted scikit learn model
-
- Args
- ----
- estimator: scikit-learn estimator object
- predictors: list of GRASS rasters
- output: Name of GRASS raster to output classification results
- predict_type: character, 'raw' for classification/regression;
- 'prob' for class probabilities
- index: Optional, list of class indices to export
- rowincr: Integer of raster rows to process at one time
- n_jobs: Number of processing cores
- """
-
- from sklearn.externals.joblib import Parallel, delayed
-
- # convert potential single index to list
- if isinstance(index, int): index = [index]
-
- # open predictors as list of rasterrow objects
- current = Region()
- n_features = len(predictors)
- rasstack = [0] * n_features
-
- for i in range(n_features):
- rasstack[i] = RasterRow(predictors[i])
- if rasstack[i].exist() is True:
- rasstack[i].open('r')
- else:
- gscript.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # -------------------------------------------------------------------------
- # turn off multiprocessing for multi-threaded classifiers
- # -------------------------------------------------------------------------
-
- # first unwrap the estimator from any potential pipelines or gridsearchCV
- if type(estimator).__name__ == 'Pipeline':
- clf_type = estimator.named_steps['classifier']
- else:
- clf_type = estimator
-
- if type(clf_type).__name__ == 'GridSearchCV' or \
- type(clf_type).__name__ == 'RandomizedSearchCV':
- clf_type = clf_type.best_estimator_
-
- # check name against already multithreaded classifiers
- if n_jobs == 1 or type(clf_type).__name__ in ['RandomForestClassifier',
- 'RandomForestRegressor',
- 'ExtraTreesClassifier',
- 'ExtraTreesRegressor',
- 'KNeighborsClassifier',
- 'XGBClassifier',
- 'XGBRegressor']:
- # ---------------------------------------------------------------------
- # sequential prediction
- # ---------------------------------------------------------------------
-
- # Prediction using blocks of rows per iteration
- for rowblock in range(0, current.rows, rowincr):
- gscript.percent(rowblock, current.rows, rowincr)
-
- # check that the row increment does not exceed the number of rows
- if rowblock+rowincr > current.rows:
- rowincr = current.rows - rowblock
- img_np_row = np.zeros((rowincr, current.cols, n_features))
-
- # loop through each row, and each band and add to 2D img_np_row
- for row in range(rowblock, rowblock+rowincr, 1):
- for band in range(n_features):
- img_np_row[row-rowblock, :, band] = \
- np.array(rasstack[band][row])
-
- # create mask
- img_np_row[img_np_row == -2147483648] = np.nan
- mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
- for feature in range(n_features):
- invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
- mask[invalid_indexes] = np.nan
-
- # reshape each row-band matrix into a n*m array
- nsamples = rowincr * current.cols
- flat_pixels = img_np_row.reshape((nsamples, n_features))
-
- # remove NaNs prior to passing to scikit-learn predict
- flat_pixels = np.nan_to_num(flat_pixels)
-
- # perform prediction for classification/regression
- if predict_type == 'raw':
- result = estimator.predict(flat_pixels)
- result = result.reshape((rowincr, current.cols))
-
- # determine nodata value and grass raster type
- if result.dtype == 'float':
- nodata = np.nan
- ftype = 'FCELL'
- else:
- nodata = -2147483648
- ftype = 'CELL'
-
- # replace NaN values so that the prediction does not have a border
- result[np.nonzero(np.isnan(mask))] = nodata
-
- # on first iteration create the RasterRow object
- if rowblock == 0:
- if predict_type == 'raw':
- classification = RasterRow(output)
- classification.open('w', ftype, overwrite=True)
-
- # write the classification result
- for row in range(rowincr):
- newrow = Buffer((result.shape[1],), mtype=ftype)
- newrow[:] = result[row, :]
- classification.put_row(newrow)
-
- # perform prediction for class probabilities
- if predict_type == 'prob':
- result_proba = estimator.predict_proba(flat_pixels)
-
- # on first loop determine number of probability classes
- # and open rasterrow objects for writing
- if rowblock == 0:
- if index is None:
- index = range(result_proba.shape[1])
- n_classes = len(index)
- else:
- n_classes = len(np.unique(index))
-
- # create and open RasterRow objects for probabilities
- prob_out_raster = [0] * n_classes
- prob = [0] * n_classes
- for iclass, label in enumerate(index):
- prob_out_raster[iclass] = output + '_classPr' + str(label)
- prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite=True)
-
- for iclass, label in enumerate(index):
- result_proba_class = result_proba[:, label]
- result_proba_class = result_proba_class.reshape((rowincr, current.cols))
- result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
-
- for row in range(rowincr):
- newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
- else:
-
- # ---------------------------------------------------------------------
- # parallel prediction
- # ---------------------------------------------------------------------
-
- # create lists of row increments
- row_mins, row_maxs = [], []
- for row in range(0, current.rows, rowincr):
- if row+rowincr > current.rows:
- rowincr = current.rows - row
- row_mins.append(row)
- row_maxs.append(row+rowincr)
-
- # perform predictions on lists of row increments in parallel
- predictions = Parallel(n_jobs=n_jobs, max_nbytes=None)(
- delayed(__predict_parallel)
- (estimator, predictors, predict_type, current, row_min, row_max)
- for row_min, row_max in zip(row_mins, row_maxs))
-
- # unpack the results
- results = []
- ftypes = []
- for block in predictions:
- results.append(block[0])
- ftypes.append(block[1])
-
- # -------------------------------------------------------------------------
- # writing of predicted results for classification
- # -------------------------------------------------------------------------
- if predict_type == 'raw':
- classification = RasterRow(output)
- classification.open('w', ftypes[0], overwrite=True)
-
- # write the classification result
- for result_block in results:
- for row in range(result_block.shape[0]):
- newrow = Buffer((result_block.shape[1],), mtype=ftypes[0])
- newrow[:] = result_block[row, :]
- classification.put_row(newrow)
-
- # -------------------------------------------------------------------------
- # writing of predicted results for probabilities
- # -------------------------------------------------------------------------
- if predict_type == 'prob':
- # determine number of classes
- if index is None:
- index = range(results[0].shape[2])
- n_classes = len(index)
- else:
- n_classes = len(np.unique(index))
-
- # create and open RasterRow objects for probabilities
- prob_out_raster = [0] * n_classes
- prob = [0] * n_classes
- for iclass, label in enumerate(index):
- prob_out_raster[iclass] = output + '_classPr' + str(label)
- prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite=True)
-
- # write the class probability results
- for results_proba_block in results:
- for iclass, label in enumerate(index):
- result_proba_class = results_proba_block[:, :, label]
-
- for row in range(result_proba_class.shape[0]):
- newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
-
- # -------------------------------------------------------------------------
- # close all maps
- # -------------------------------------------------------------------------
- if predict_type == 'raw': classification.close()
- if predict_type == 'prob':
- try:
- for iclass in range(n_classes):
- prob[iclass].close()
- except:
- pass
-
-
-def __predict_parallel(estimator, predictors, predict_type, current, row_min, row_max):
- """
- Performs prediction on range of rows in grass rasters
-
- Args
- ----
- estimator: scikit-learn estimator object
- predictors: list of GRASS rasters
- predict_type: character, 'raw' for classification/regression;
- 'prob' for class probabilities
- current: current region settings
- row_min, row_max: Range of rows of grass rasters to perform predictions
-
- Returns
- -------
- result: 2D (classification) or 3D numpy array (class probabilities) of predictions
- ftypes: data storage type
- """
-
- # initialize output
- result, ftype, mask = None, None, None
-
- # open grass rasters
- n_features = len(predictors)
- rasstack = [0] * n_features
-
- for i in range(n_features):
- rasstack[i] = RasterRow(predictors[i])
- if rasstack[i].exist() is True:
- rasstack[i].open('r')
- else:
- gscript.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # loop through each row, and each band and add to 2D img_np_row
- img_np_row = np.zeros((row_max-row_min, current.cols, n_features))
- for row in range(row_min, row_max):
- for band in range(n_features):
- img_np_row[row-row_min, :, band] = np.array(rasstack[band][row])
-
- # create mask
- img_np_row[img_np_row == -2147483648] = np.nan
- mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
- for feature in range(n_features):
- invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
- mask[invalid_indexes] = np.nan
-
- # reshape each row-band matrix into a n*m array
- nsamples = (row_max-row_min) * current.cols
- flat_pixels = img_np_row.reshape((nsamples, n_features))
-
- # remove NaNs prior to passing to scikit-learn predict
- flat_pixels = np.nan_to_num(flat_pixels)
-
- # perform prediction for classification/regression
- if predict_type == 'raw':
- result = estimator.predict(flat_pixels)
- result = result.reshape((row_max-row_min, current.cols))
-
- # determine nodata value and grass raster type
- if result.dtype == 'float':
- nodata = np.nan
- ftype = 'FCELL'
- else:
- nodata = -2147483648
- ftype = 'CELL'
-
- # replace NaN values so that the prediction does not have a border
- result[np.nonzero(np.isnan(mask))] = nodata
-
- # perform prediction for class probabilities
- if predict_type == 'prob':
- result = estimator.predict_proba(flat_pixels)
- result = result.reshape((row_max-row_min, current.cols, result.shape[1]))
- result[np.nonzero(np.isnan(mask))] = np.nan
-
- # close maps
- for i in range(n_features):
- rasstack[i].close()
-
- return (result, ftype)
-
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The module rlearn_rasters contains functions to
+extract training data from GRASS rasters.
+
+"""
+
+
+from __future__ import absolute_import
+
+import numpy as np
+import tempfile
+
+import grass.script as gscript
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.vector import VectorTopo
+from grass.pygrass.utils import get_raster_for_points, pixel2coor
+
+
+def extract(response, predictors, lowmem=False):
+ """
+ Samples a list of GRASS rasters using a labelled raster
+ Per raster sampling
+
+ Args
+ ----
+ response: String; GRASS raster with labelled pixels
+ predictors: List of GRASS rasters containing explanatory variables
+ lowmem: Boolean, use numpy memmap to query predictors
+
+ Returns
+ -------
+ training_data: Numpy array of extracted raster values
+ training_labels: Numpy array of labels
+ is_train: Row and Columns of label positions
+ """
+
+ current = Region()
+
+ # open response raster as rasterrow and read as np array
+ if RasterRow(response).exist() is True:
+ roi_gr = RasterRow(response)
+ roi_gr.open('r')
+
+ if lowmem is False:
+ response_np = np.array(roi_gr)
+ else:
+ response_np = np.memmap(
+ tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+ response_np[:] = np.array(roi_gr)[:]
+ else:
+ gscript.fatal("GRASS response raster does not exist.... exiting")
+
+ # determine number of predictor rasters
+ n_features = len(predictors)
+
+ # check to see if all predictors exist
+ for i in range(n_features):
+ if RasterRow(predictors[i]).exist() is not True:
+ gscript.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # check if any of those pixels are labelled (not equal to nodata)
+ # can use even if roi is FCELL because nodata will be nan
+ is_train = np.nonzero(response_np > -2147483648)
+ training_labels = response_np[is_train]
+ n_labels = np.array(is_train).shape[1]
+
+ # Create a zero numpy array of len training labels
+ if lowmem is False:
+ training_data = np.zeros((n_labels, n_features))
+ else:
+ training_data = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(n_labels, n_features))
+
+ # Loop through each raster and sample pixel values at training indexes
+ if lowmem is True:
+ feature_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+
+ for f in range(n_features):
+ predictor_gr = RasterRow(predictors[f])
+ predictor_gr.open('r')
+
+ if lowmem is False:
+ feature_np = np.array(predictor_gr)
+ else:
+ feature_np[:] = np.array(predictor_gr)[:]
+
+ training_data[0:n_labels, f] = feature_np[is_train]
+
+ # close each predictor map
+ predictor_gr.close()
+
+ # convert any CELL map nodata values to NaN in the training data
+ for i in range(n_features):
+ training_data[training_data[:, i] == -2147483648] = np.nan
+
+ # convert indexes of training pixels from tuple to n*2 np array
+ is_train = np.array(is_train).T
+ for i in range(is_train.shape[0]):
+ is_train[i, :] = np.array(pixel2coor(tuple(is_train[i]), current))
+
+ # close the response map
+ roi_gr.close()
+
+ return(training_data, training_labels, is_train)
+
+
+def extract_points(gvector, grasters, field):
+ """
+ Extract values from grass rasters using vector points input
+
+ Args
+ ----
+ gvector: character, name of grass points vector
+ grasters: list of names of grass raster to query
+ field: character, name of field in table to use as response variable
+
+ Returns
+ -------
+ X: 2D numpy array of training data
+ y: 1D numpy array with the response variable
+ coordinates: 2D numpy array of sample coordinates
+ """
+ # open grass vector
+ points = VectorTopo(gvector.split('@')[0])
+ points.open('r')
+
+ # create link to attribute table
+ points.dblinks.by_name(name=gvector)
+
+ # extract table field to numpy array
+ table = points.table
+ cur = table.execute("SELECT {field} FROM {name}".format(field=field, name=table.name))
+ y = np.array([np.isnan if c is None else c[0] for c in cur])
+
+ # extract raster data
+ X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
+ for i, raster in enumerate(grasters):
+ rio = RasterRow(raster)
+ values = np.asarray(get_raster_for_points(points, rio))
+ coordinates = values[:, 1:3]
+ X[:, i] = values[:, 3]
+ rio.close()
+
+ # set any grass integer nodata values to NaN
+ X[X == -2147483648] = np.nan
+
+ # remove missing response data
+ X = X[~np.isnan(y)]
+ coordinates = coordinates[~np.isnan(y)]
+ y = y[~np.isnan(y)]
+
+ # close
+ points.close()
+
+ return(X, y, coordinates)
+
+
+def predict(estimator, predictors, output, predict_type='raw',
+ index=None, rowincr=25, n_jobs=-2):
+ """
+ Prediction on list of GRASS rasters using a fitted scikit learn model
+
+ Args
+ ----
+ estimator: scikit-learn estimator object
+ predictors: list of GRASS rasters
+ output: Name of GRASS raster to output classification results
+ predict_type: character, 'raw' for classification/regression;
+ 'prob' for class probabilities
+ index: Optional, list of class indices to export
+ rowincr: Integer of raster rows to process at one time
+ n_jobs: Number of processing cores
+ """
+
+ from sklearn.externals.joblib import Parallel, delayed
+
+ # convert potential single index to list
+ if isinstance(index, int): index = [index]
+
+ # open predictors as list of rasterrow objects
+ current = Region()
+ n_features = len(predictors)
+
+ # -------------------------------------------------------------------------
+ # turn off multiprocessing for multi-threaded classifiers
+ # -------------------------------------------------------------------------
+
+ # first unwrap the estimator from any potential pipelines or gridsearchCV
+ if type(estimator).__name__ == 'Pipeline':
+ clf_type = estimator.named_steps['classifier']
+ else:
+ clf_type = estimator
+
+ if type(clf_type).__name__ == 'GridSearchCV' or \
+ type(clf_type).__name__ == 'RandomizedSearchCV':
+ clf_type = clf_type.best_estimator_
+
+ # check name against already multithreaded classifiers
+ if n_jobs == 1 or type(clf_type).__name__ in ['RandomForestClassifier',
+ 'RandomForestRegressor',
+ 'ExtraTreesClassifier',
+ 'ExtraTreesRegressor',
+ 'KNeighborsClassifier',
+ 'XGBClassifier',
+ 'XGBRegressor']:
+ # ---------------------------------------------------------------------
+ # sequential prediction
+ # ---------------------------------------------------------------------
+
+ rasstack = [0] * n_features
+
+ for i in range(n_features):
+ rasstack[i] = RasterRow(predictors[i])
+ if rasstack[i].exist() is True:
+ rasstack[i].open('r')
+ else:
+ gscript.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # Prediction using blocks of rows per iteration
+ for rowblock in range(0, current.rows, rowincr):
+ gscript.percent(rowblock, current.rows, rowincr)
+
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
+
+ # loop through each row, and each band and add to 2D img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
+
+ # create mask
+ img_np_row[img_np_row == -2147483648] = np.nan
+ mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+ for feature in range(n_features):
+ invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+ mask[invalid_indexes] = np.nan
+
+ # reshape each row-band matrix into a n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaNs prior to passing to scikit-learn predict
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # perform prediction for classification/regression
+ if predict_type == 'raw':
+ result = estimator.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
+
+ # determine nodata value and grass raster type
+ if result.dtype == 'float':
+ nodata = np.nan
+ ftype = 'FCELL'
+ else:
+ nodata = -2147483648
+ ftype = 'CELL'
+
+ # replace NaN values so that the prediction does not have a border
+ result[np.nonzero(np.isnan(mask))] = nodata
+
+ # on first iteration create the RasterRow object
+ if rowblock == 0:
+ if predict_type == 'raw':
+ classification = RasterRow(output)
+ classification.open('w', ftype, overwrite=True)
+
+ # write the classification result
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
+
+ # perform prediction for class probabilities
+ if predict_type == 'prob':
+ result_proba = estimator.predict_proba(flat_pixels)
+
+ # on first loop determine number of probability classes
+ # and open rasterrow objects for writing
+ if rowblock == 0:
+ if index is None:
+ index = range(result_proba.shape[1])
+ n_classes = len(index)
+ else:
+ n_classes = len(np.unique(index))
+
+ # create and open RasterRow objects for probabilities
+ prob_out_raster = [0] * n_classes
+ prob = [0] * n_classes
+ for iclass, label in enumerate(index):
+ prob_out_raster[iclass] = output + '_classPr' + str(label)
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ for iclass, label in enumerate(index):
+ result_proba_class = result_proba[:, label]
+ result_proba_class = result_proba_class.reshape((rowincr, current.cols))
+ result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
+
+ for row in range(rowincr):
+ newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
+ # close maps
+ for i in range(n_features):
+ rasstack[i].close()
+
+ else:
+
+ # ---------------------------------------------------------------------
+ # parallel prediction
+ # ---------------------------------------------------------------------
+
+ # create lists of row increments
+ row_mins, row_maxs = [], []
+ for row in range(0, current.rows, rowincr):
+ if row+rowincr > current.rows:
+ rowincr = current.rows - row
+ row_mins.append(row)
+ row_maxs.append(row+rowincr)
+
+ # perform predictions on lists of row increments in parallel
+ predictions = Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ delayed(__predict_parallel)
+ (estimator, predictors, predict_type, current, row_min, row_max)
+ for row_min, row_max in zip(row_mins, row_maxs))
+
+ # unpack the results
+ results = []
+ ftypes = []
+ for block in predictions:
+ results.append(block[0])
+ ftypes.append(block[1])
+
+ # -------------------------------------------------------------------------
+ # writing of predicted results for classification
+ # -------------------------------------------------------------------------
+ if predict_type == 'raw':
+ classification = RasterRow(output)
+ classification.open('w', ftypes[0], overwrite=True)
+
+ # write the classification result
+ for result_block in results:
+ for row in range(result_block.shape[0]):
+ newrow = Buffer((result_block.shape[1],), mtype=ftypes[0])
+ newrow[:] = result_block[row, :]
+ classification.put_row(newrow)
+
+ # -------------------------------------------------------------------------
+ # writing of predicted results for probabilities
+ # -------------------------------------------------------------------------
+ if predict_type == 'prob':
+ # determine number of classes
+ if index is None:
+ index = range(results[0].shape[2])
+ n_classes = len(index)
+ else:
+ n_classes = len(np.unique(index))
+
+ # create and open RasterRow objects for probabilities
+ prob_out_raster = [0] * n_classes
+ prob = [0] * n_classes
+ for iclass, label in enumerate(index):
+ prob_out_raster[iclass] = output + '_classPr' + str(label)
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ # write the class probability results
+ for results_proba_block in results:
+ for iclass, label in enumerate(index):
+ result_proba_class = results_proba_block[:, :, label]
+
+ for row in range(result_proba_class.shape[0]):
+ newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
+ # -------------------------------------------------------------------------
+ # close all maps
+ # -------------------------------------------------------------------------
+ if predict_type == 'raw': classification.close()
+ if predict_type == 'prob':
+ try:
+ for iclass in range(n_classes):
+ prob[iclass].close()
+ except:
+ pass
+
+
+def __predict_parallel(estimator, predictors, predict_type, current, row_min, row_max):
+ """
+ Performs prediction on range of rows in grass rasters
+
+ Args
+ ----
+ estimator: scikit-learn estimator object
+ predictors: list of GRASS rasters
+ predict_type: character, 'raw' for classification/regression;
+ 'prob' for class probabilities
+ current: current region settings
+ row_min, row_max: Range of rows of grass rasters to perform predictions
+
+ Returns
+ -------
+ result: 2D (classification) or 3D numpy array (class probabilities) of predictions
+ ftypes: data storage type
+ """
+
+ # initialize output
+ result, ftype, mask = None, None, None
+
+ # open grass rasters
+ n_features = len(predictors)
+ rasstack = [0] * n_features
+
+ for i in range(n_features):
+ rasstack[i] = RasterRow(predictors[i])
+ if rasstack[i].exist() is True:
+ rasstack[i].open('r')
+ else:
+ gscript.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # loop through each row, and each band and add to 2D img_np_row
+ img_np_row = np.zeros((row_max-row_min, current.cols, n_features))
+ for row in range(row_min, row_max):
+ for band in range(n_features):
+ img_np_row[row-row_min, :, band] = np.array(rasstack[band][row])
+
+ # create mask
+ img_np_row[img_np_row == -2147483648] = np.nan
+ mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+ for feature in range(n_features):
+ invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+ mask[invalid_indexes] = np.nan
+
+ # reshape each row-band matrix into a n*m array
+ nsamples = (row_max-row_min) * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaNs prior to passing to scikit-learn predict
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # perform prediction for classification/regression
+ if predict_type == 'raw':
+ result = estimator.predict(flat_pixels)
+ result = result.reshape((row_max-row_min, current.cols))
+
+ # determine nodata value and grass raster type
+ if result.dtype == 'float':
+ nodata = np.nan
+ ftype = 'FCELL'
+ else:
+ nodata = -2147483648
+ ftype = 'CELL'
+
+ # replace NaN values so that the prediction does not have a border
+ result[np.nonzero(np.isnan(mask))] = nodata
+
+ # perform prediction for class probabilities
+ if predict_type == 'prob':
+ result = estimator.predict_proba(flat_pixels)
+ result = result.reshape((row_max-row_min, current.cols, result.shape[1]))
+ result[np.nonzero(np.isnan(mask))] = np.nan
+
+ # close maps
+ for i in range(n_features):
+ rasstack[i].close()
+
+ return (result, ftype)
+
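For context, the per-block prediction in __predict_parallel follows a common scikit-learn raster pattern: read a block of rows into a (rows, cols, bands) array, flatten it to (n_pixels, n_features), predict, then reshape and re-apply the NaN mask so nodata cells stay nodata. A minimal self-contained sketch of that pattern on synthetic data (the RandomForestClassifier and the random training values are illustrative only, not the module's GRASS raster I/O):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# synthetic block: 4 rows x 5 cols x 3 bands, with one nodata cell
rng = np.random.RandomState(0)
block = rng.rand(4, 5, 3)
block[1, 2, :] = np.nan                      # simulate a masked cell

# mask of invalid pixels (NaN in any band)
mask = np.any(np.isnan(block), axis=2)

# flatten to (n_pixels, n_features); replace NaNs before predicting
flat = np.nan_to_num(block.reshape(-1, block.shape[2]))

# hypothetical fitted estimator, trained on random data purely for illustration
est = RandomForestClassifier(n_estimators=10, random_state=0)
est.fit(rng.rand(20, 3), rng.randint(0, 2, 20))

# predict, reshape back to (rows, cols) and restore nodata under the mask
pred = est.predict(flat).reshape(block.shape[0], block.shape[1]).astype(float)
pred[mask] = np.nan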
Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py 2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py 2017-06-01 22:03:12 UTC (rev 71157)
@@ -1,282 +1,282 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-The module rlearn_utils contains functions to assist
-with passing pre-defined scikit-learn classifiers
-and other utilities for loading/saving training data.
-
-"""
-
-from __future__ import absolute_import
-from subprocess import PIPE
-
-import numpy as np
-import os
-from copy import deepcopy
-
-import grass.script as gscript
-from grass.pygrass.modules.shortcuts import imagery as im
-
-def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
- """
- Provides the classifiers and parameters used by the module
-
- Args
- ----
- estimator: Name of estimator
- random_state: Seed to use in randomized components
- n_jobs: Integer, number of processing cores to use
- p: Dict containing classifier settings
- weights: None, or 'balanced' to add class_weights
-
- Returns
- -------
- clf: Scikit-learn classifier object
- mode: Flag to indicate whether classifier performs classification or
- regression
- """
-
- from sklearn.linear_model import LogisticRegression
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
- from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
- from sklearn.naive_bayes import GaussianNB
- from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
- from sklearn.ensemble import (
- RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
- ExtraTreesRegressor)
- from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
- from sklearn.svm import SVC
- from sklearn.neighbors import KNeighborsClassifier
-
- # convert balanced boolean to scikit learn method
- if weights is True:
- weights = 'balanced'
- else: weights = None
-
- # optional packages that add additional classifiers here
- if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
- try:
- from sklearn.pipeline import Pipeline
- from pyearth import Earth
-
- earth_classifier = Pipeline([('Earth',
- Earth(max_degree=p['max_degree'])),
- ('Logistic', LogisticRegression(n_jobs=n_jobs))])
-
- classifiers = {'EarthClassifier': earth_classifier,
- 'EarthRegressor': Earth(max_degree=p['max_degree'])}
- except:
- gscript.fatal('Py-earth package not installed')
-
- elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
- try:
- from xgboost import XGBClassifier, XGBRegressor
-
- if p['max_depth'] is None:
- p['max_depth'] = int(3)
-
- classifiers = {
- 'XGBClassifier':
- XGBClassifier(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- subsample=p['subsample'],
- nthread=n_jobs),
- 'XGBRegressor':
- XGBRegressor(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- subsample=p['subsample'],
- nthread=n_jobs)}
- except:
- gscript.fatal('XGBoost package not installed')
- else:
- # core sklearn classifiers go here
- classifiers = {
- 'SVC': SVC(C=p['C'],
- class_weight=weights,
- probability=True,
- random_state=random_state),
- 'LogisticRegression':
- LogisticRegression(C=p['C'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- fit_intercept=True),
- 'DecisionTreeClassifier':
- DecisionTreeClassifier(max_depth=p['max_depth'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state),
- 'DecisionTreeRegressor':
- DecisionTreeRegressor(max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state),
- 'RandomForestClassifier':
- RandomForestClassifier(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'RandomForestRegressor':
- RandomForestRegressor(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'ExtraTreesClassifier':
- ExtraTreesClassifier(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- class_weight=weights,
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'ExtraTreesRegressor':
- ExtraTreesRegressor(n_estimators=p['n_estimators'],
- max_features=p['max_features'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- random_state=random_state,
- n_jobs=n_jobs,
- oob_score=False),
- 'GradientBoostingClassifier':
- GradientBoostingClassifier(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- subsample=p['subsample'],
- max_features=p['max_features'],
- random_state=random_state),
- 'GradientBoostingRegressor':
- GradientBoostingRegressor(learning_rate=p['learning_rate'],
- n_estimators=p['n_estimators'],
- max_depth=p['max_depth'],
- min_samples_split=p['min_samples_split'],
- min_samples_leaf=p['min_samples_leaf'],
- subsample=p['subsample'],
- max_features=p['max_features'],
- random_state=random_state),
- 'GaussianNB': GaussianNB(),
- 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
- 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
- 'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=p['n_neighbors'],
- weights=p['weights'],
- n_jobs=n_jobs)
- }
-
- # define classifier
- clf = classifiers[estimator]
-
- # classification or regression
- if estimator == 'LogisticRegression' \
- or estimator == 'DecisionTreeClassifier' \
- or estimator == 'RandomForestClassifier' \
- or estimator == 'ExtraTreesClassifier' \
- or estimator == 'GradientBoostingClassifier' \
- or estimator == 'GaussianNB' \
- or estimator == 'LinearDiscriminantAnalysis' \
- or estimator == 'QuadraticDiscriminantAnalysis' \
- or estimator == 'EarthClassifier' \
- or estimator == 'XGBClassifier' \
- or estimator == 'SVC' \
- or estimator == 'KNeighborsClassifier':
- mode = 'classification'
- else:
- mode = 'regression'
-
- return (clf, mode)
-
-
-def save_training_data(X, y, groups, coords, file):
- """
- Saves any extracted training data to a csv file
-
- Args
- ----
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels
- coords: Numpy array containing xy coordinates of samples
- file: Path to a csv file to save data to
- """
-
- # if there are no group labels, create a nan filled array
- if groups is None:
- groups = np.empty((y.shape[0]))
- groups[:] = np.nan
-
- training_data = np.column_stack([coords, X, y, groups])
- np.savetxt(file, training_data, delimiter=',')
-
-
-def load_training_data(file):
- """
- Loads training data and labels from a csv file
-
- Args
- ----
- file: Path to a csv file to load data from
-
- Returns
- -------
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels, or None
- coords: Numpy array containing x,y coordinates of samples
- """
-
- training_data = np.loadtxt(file, delimiter=',')
- n_cols = training_data.shape[1]
- last_Xcol = n_cols-2
-
- # check to see if last column contains group labels or nans
- groups = training_data[:, -1]
-
- # if all nans then set groups to None
- if bool(np.isnan(groups).all()) is True:
- groups = None
-
- # fetch X and y
- coords = training_data[:, 0:2]
- X = training_data[:, 2:last_Xcol]
- y = training_data[:, -2]
-
- return(X, y, groups, coords)
-
-
-def maps_from_group(group):
- """
- Parse individual rasters into a list from an imagery group
-
- Args
- ----
- group: String; GRASS imagery group
-
- Returns
- -------
- maplist: List containing individual GRASS raster maps
- map_names: List with print friendly map names
- """
- groupmaps = im.group(group=group, flags="g",
- quiet=True, stdout_=PIPE).outputs.stdout
-
- maplist = groupmaps.split(os.linesep)
- maplist = maplist[0:len(maplist)-1]
- map_names = []
-
- for rastername in maplist:
- map_names.append(rastername.split('@')[0])
-
- return(maplist, map_names)
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The module rlearn_utils contains functions to assist
+with passing pre-defined scikit-learn classifiers
+and other utilities for loading/saving training data.
+
+"""
+
+from __future__ import absolute_import
+from subprocess import PIPE
+
+import numpy as np
+import os
+from copy import deepcopy
+
+import grass.script as gscript
+from grass.pygrass.modules.shortcuts import imagery as im
+
+def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
+ """
+ Provides the classifiers and parameters used by the module
+
+ Args
+ ----
+ estimator: Name of estimator
+ random_state: Seed to use in randomized components
+ n_jobs: Integer, number of processing cores to use
+ p: Dict containing classifier settings
+ weights: None, or 'balanced' to add class_weights
+
+ Returns
+ -------
+ clf: Scikit-learn classifier object
+ mode: Flag to indicate whether classifier performs classification or
+ regression
+ """
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.ensemble import (
+ RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
+ ExtraTreesRegressor)
+ from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+ from sklearn.svm import SVC
+ from sklearn.neighbors import KNeighborsClassifier
+
+ # convert balanced boolean to scikit learn method
+ if weights is True:
+ weights = 'balanced'
+ else: weights = None
+
+ # optional packages that add additional classifiers here
+ if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
+ try:
+ from sklearn.pipeline import Pipeline
+ from pyearth import Earth
+
+ earth_classifier = Pipeline([('Earth',
+ Earth(max_degree=p['max_degree'])),
+ ('Logistic', LogisticRegression(n_jobs=n_jobs))])
+
+ classifiers = {'EarthClassifier': earth_classifier,
+ 'EarthRegressor': Earth(max_degree=p['max_degree'])}
+ except:
+ gscript.fatal('Py-earth package not installed')
+
+ elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
+ try:
+ from xgboost import XGBClassifier, XGBRegressor
+
+ if p['max_depth'] is None:
+ p['max_depth'] = int(3)
+
+ classifiers = {
+ 'XGBClassifier':
+ XGBClassifier(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ subsample=p['subsample'],
+ nthread=n_jobs),
+ 'XGBRegressor':
+ XGBRegressor(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ subsample=p['subsample'],
+ nthread=n_jobs)}
+ except:
+ gscript.fatal('XGBoost package not installed')
+ else:
+ # core sklearn classifiers go here
+ classifiers = {
+ 'SVC': SVC(C=p['C'],
+ class_weight=weights,
+ probability=True,
+ random_state=random_state),
+ 'LogisticRegression':
+ LogisticRegression(C=p['C'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ fit_intercept=True),
+ 'DecisionTreeClassifier':
+ DecisionTreeClassifier(max_depth=p['max_depth'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state),
+ 'DecisionTreeRegressor':
+ DecisionTreeRegressor(max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state),
+ 'RandomForestClassifier':
+ RandomForestClassifier(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'RandomForestRegressor':
+ RandomForestRegressor(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'ExtraTreesClassifier':
+ ExtraTreesClassifier(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ class_weight=weights,
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'ExtraTreesRegressor':
+ ExtraTreesRegressor(n_estimators=p['n_estimators'],
+ max_features=p['max_features'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ random_state=random_state,
+ n_jobs=n_jobs,
+ oob_score=False),
+ 'GradientBoostingClassifier':
+ GradientBoostingClassifier(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ subsample=p['subsample'],
+ max_features=p['max_features'],
+ random_state=random_state),
+ 'GradientBoostingRegressor':
+ GradientBoostingRegressor(learning_rate=p['learning_rate'],
+ n_estimators=p['n_estimators'],
+ max_depth=p['max_depth'],
+ min_samples_split=p['min_samples_split'],
+ min_samples_leaf=p['min_samples_leaf'],
+ subsample=p['subsample'],
+ max_features=p['max_features'],
+ random_state=random_state),
+ 'GaussianNB': GaussianNB(),
+ 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+ 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+ 'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=p['n_neighbors'],
+ weights=p['weights'],
+ n_jobs=n_jobs)
+ }
+
+ # define classifier
+ clf = classifiers[estimator]
+
+ # classification or regression
+ if estimator == 'LogisticRegression' \
+ or estimator == 'DecisionTreeClassifier' \
+ or estimator == 'RandomForestClassifier' \
+ or estimator == 'ExtraTreesClassifier' \
+ or estimator == 'GradientBoostingClassifier' \
+ or estimator == 'GaussianNB' \
+ or estimator == 'LinearDiscriminantAnalysis' \
+ or estimator == 'QuadraticDiscriminantAnalysis' \
+ or estimator == 'EarthClassifier' \
+ or estimator == 'XGBClassifier' \
+ or estimator == 'SVC' \
+ or estimator == 'KNeighborsClassifier':
+ mode = 'classification'
+ else:
+ mode = 'regression'
+
+ return (clf, mode)
+
+
+def save_training_data(X, y, groups, coords, file):
+ """
+ Saves any extracted training data to a csv file
+
+ Args
+ ----
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels
+ coords: Numpy array containing xy coordinates of samples
+ file: Path to a csv file to save data to
+ """
+
+ # if there are no group labels, create a nan filled array
+ if groups is None:
+ groups = np.empty((y.shape[0]))
+ groups[:] = np.nan
+
+ training_data = np.column_stack([coords, X, y, groups])
+ np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+ """
+ Loads training data and labels from a csv file
+
+ Args
+ ----
+ file: Path to a csv file to load data from
+
+ Returns
+ -------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels, or None
+ coords: Numpy array containing x,y coordinates of samples
+ """
+
+ training_data = np.loadtxt(file, delimiter=',')
+ n_cols = training_data.shape[1]
+ last_Xcol = n_cols-2
+
+ # check to see if last column contains group labels or nans
+ groups = training_data[:, -1]
+
+ # if all nans then set groups to None
+ if bool(np.isnan(groups).all()) is True:
+ groups = None
+
+ # fetch X and y
+ coords = training_data[:, 0:2]
+ X = training_data[:, 2:last_Xcol]
+ y = training_data[:, -2]
+
+ return(X, y, groups, coords)
+
+
+def maps_from_group(group):
+ """
+ Parse individual rasters into a list from an imagery group
+
+ Args
+ ----
+ group: String; GRASS imagery group
+
+ Returns
+ -------
+ maplist: List containing individual GRASS raster maps
+ map_names: List with print friendly map names
+ """
+ groupmaps = im.group(group=group, flags="g",
+ quiet=True, stdout_=PIPE).outputs.stdout
+
+ maplist = groupmaps.split(os.linesep)
+ maplist = maplist[0:len(maplist)-1]
+ map_names = []
+
+ for rastername in maplist:
+ map_names.append(rastername.split('@')[0])
+
+ return(maplist, map_names)
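For reference, the CSV layout written by save_training_data and read back by load_training_data is simply x, y, predictor columns, label, group (the group column is NaN-filled when no group labels are given). A minimal numpy-only round trip with made-up values, independent of GRASS (the file name training.csv is hypothetical):

import numpy as np

# made-up sample data: 5 points, 3 predictors, no group labels
coords = np.arange(10, dtype=float).reshape(5, 2)      # x, y
X = np.random.rand(5, 3)
y = np.array([1., 0., 1., 1., 0.])
groups = np.full(5, np.nan)

# save: columns are x, y, X1..Xk, label, group
np.savetxt('training.csv',
           np.column_stack([coords, X, y, groups]), delimiter=',')

# load and split the columns back out, as load_training_data does
data = np.loadtxt('training.csv', delimiter=',')
coords2, X2, y2 = data[:, 0:2], data[:, 2:-2], data[:, -2]
groups2 = None if np.isnan(data[:, -1]).all() else data[:, -1]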