[GRASS-SVN] r71157 - grass-addons/grass7/raster/r.learn.ml

svn_grass at osgeo.org
Thu Jun 1 15:03:12 PDT 2017


Author: spawley
Date: 2017-06-01 15:03:12 -0700 (Thu, 01 Jun 2017)
New Revision: 71157

Modified:
   grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
   grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
   grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
   grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py
Log:
r.learn.ml: close all raster maps properly; recalibrate class probabilities after balancing; use the Matthews correlation coefficient as the tuning scorer
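
The commit centres on the open/read/close pattern for pygrass raster
handles. A minimal sketch of that pattern, assuming an existing map named
'elevation' (the map name and the try/finally wrapper are illustrative and
not part of the commit):

    import numpy as np
    from grass.pygrass.raster import RasterRow

    r = RasterRow('elevation')
    r.open('r')                 # acquire the raster handle
    try:
        data = np.array(r)      # read the whole map into memory
    finally:
        r.close()               # always release the handle, even on error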

Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py	2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py	2017-06-01 22:03:12 UTC (rev 71157)
@@ -448,6 +448,7 @@
         from sklearn.utils import shuffle
         from sklearn import metrics
         from sklearn.metrics import make_scorer
+        from sklearn.calibration import CalibratedClassifierCV
     except:
         gscript.fatal("Scikit learn 0.18 or newer is not installed")
 
@@ -577,7 +578,7 @@
     if mode == 'classification':
         scoring = ['accuracy', 'precision', 'recall', 'f1', 'kappa',\
                    'balanced_accuracy']
-        search_scorer = make_scorer(metrics.cohen_kappa_score)
+        search_scorer = make_scorer(metrics.matthews_corrcoef)
     else:
         scoring = ['r2', 'neg_mean_squared_error']
         search_scorer = 'r2'
@@ -587,7 +588,7 @@
     # -------------------------------------------------------------------------
 
     # fetch individual raster names from group
-    maplist, map_names = maps_from_group(group)
+    maplist, _ = maps_from_group(group)
 
     if model_load == '':
 
@@ -712,7 +713,6 @@
         # ---------------------------------------------------------------------
         # define the preprocessing pipeline
         # ---------------------------------------------------------------------
-
         # standardization
         if norm_data is True and categorymaps is None:
             clf = Pipeline([('scaling', StandardScaler()),
@@ -760,12 +760,11 @@
         # ---------------------------------------------------------------------
         # classifier training
         # ---------------------------------------------------------------------
-
         gscript.message(os.linesep)
         gscript.message(('Fitting model using ' + classifier))
 
         # pass groups to the fit method when the inner CV is GroupKFold/GroupShuffleSplit
-        if isinstance(inner, (GroupKFold, GroupShuffleSplit)) and any(param_grid) is True:
+        if isinstance(inner, (GroupKFold, GroupShuffleSplit)):
             if balance is True and classifier in (
                     'GradientBoostingClassifier', 'XGBClassifier'):
                 clf.fit(X=X, y=y, groups=group_id, sample_weight=class_weights)
@@ -792,7 +791,6 @@
         # ---------------------------------------------------------------------
         # cross-validation
         # ---------------------------------------------------------------------
-
         # If cv > 1 then use cross-validation to generate performance measures
         if cv > 1 and tune_only is not True:
             if mode == 'classification' and cv > np.histogram(
@@ -885,6 +883,13 @@
     if model_only is not True:
         gscript.message(os.linesep)
 
+        # recalibrate probabilities if classes have been balanced
+        if balance is True:
+            if any(param_grid) is True:
+                clf = clf.best_estimator_
+            clf = CalibratedClassifierCV(clf, cv=outer)
+            clf.fit(X, y)
+
         # predict classification/regression raster
         if prob_only is False:
             gscript.message('Predicting classification/regression raster...')
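
The block added above wraps the (possibly tuned) estimator in
CalibratedClassifierCV because fitting with balanced sample weights distorts
the predicted class probabilities. A self-contained sketch of the same idea,
using synthetic data and a RandomForestClassifier as a stand-in estimator
(neither is taken from the module):

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    # imbalanced synthetic data as a stand-in for the extracted training set
    X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                               random_state=0)
    base = RandomForestClassifier(n_estimators=50, random_state=0)

    # refit with per-fold calibration; the module passes cv=outer here
    calibrated = CalibratedClassifierCV(base, cv=3)
    calibrated.fit(X, y)
    probs = calibrated.predict_proba(X)  # calibrated class probabilities

The related switch to make_scorer(metrics.matthews_corrcoef) means
hyperparameter searches are now ranked by the Matthews correlation
coefficient, which remains informative on imbalanced classes.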

Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py	2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py	2017-06-01 22:03:12 UTC (rev 71157)
@@ -1,350 +1,350 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-The module rlearn_crossval contains functions to perform
-model validation and permutation feature importances.
-
-"""
-
-from __future__ import absolute_import
-from copy import deepcopy
-
-import numpy as np
-import os
-from numpy.random import RandomState
-
-import grass.script as gscript
-
-def specificity_score(y_true, y_pred):
-    """
-    Calculate specificity score
-
-    Args
-    ----
-    y_true: 1D numpy array of truth values
-    y_pred: 1D numpy array of predicted classes
-
-    Returns
-    -------
-    specificity: specificity score
-    """
-    from sklearn.metrics import confusion_matrix
-
-    cm = confusion_matrix(y_true, y_pred)
-    tn = float(cm[0][0])
-    fp = float(cm[0][1])
-
-    return tn/(tn+fp)
-
-
-def varimp_permutation(estimator, X, y, n_permutations, scorer,
-                       n_jobs, random_state):
-    """
-    Method to perform permutation-based feature importance during
-    cross-validation (cross-validation is applied externally to this
-    method)
-
-    Procedure is:
-    1. Pass fitted estimator and test partition X y
-    2. Assess the score on the test partition (best_score)
-    3. Permute each variable and assess the difference between best_score
-       and the score obtained with the permuted variable
-    4. Repeat (3) for many random permutations
-    5. Average the repeats
-
-    Args
-    ----
-    estimator: estimator that has been fitted to a training partition
-    X, y: data and labels from a test partition
-    n_permutations: number of random permutations to apply
-    scorer: scikit-learn metric function to use
-    n_jobs: integer, number of processing cores
-    random_state: seed to pass to the numpy random.seed
-
-    Returns
-    -------
-    scores: scores for each predictor following permutation
-    """
-
-    from sklearn.externals.joblib import Parallel, delayed
-
-    # calculate score on original variables without permutation
-    # determine best metric type for binary/multiclass/regression scenarios
-    y_pred = estimator.predict(X)
-    best_score = scorer(y, y_pred)
-
-    # repeated permutations and return difference from best score per predictor
-    scores = Parallel(n_jobs=n_jobs)(
-        delayed(__permute)(
-            estimator, X, y, best_score, scorer, random_state)
-        for n in range(n_permutations))
-
-    # average the repetitions
-    scores = np.asarray(scores)
-    scores = scores.mean(axis=0)
-
-    return scores
-
-
-def __permute(estimator, X, y, best_score, scorer, random_state):
-    """
-    Permute each predictor and measure difference from best score
-
-    Args
-    ----
-    estimator: scikit learn estimator
-    X, y: data and labels from a test partition
-    best_score: best scorer obtained on unperturbed data
-    scorer: scoring method to use to measure importances
-    random_state: random seed
-
-    Returns
-    -------
-    scores: 2D numpy array of scores for each predictor following permutation
-    """
-
-    rstate = RandomState(random_state)
-
-    # permute each predictor variable and assess difference in score
-    scores = np.zeros(X.shape[1])
-
-    for i in range(X.shape[1]):
-        Xscram = np.copy(X)
-        Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])
-
-        # fit the model on the training data and predict the test data
-        y_pred = estimator.predict(Xscram)
-        scores[i] = best_score-scorer(y, y_pred)
-        if scores[i] < 0:
-            scores[i] = 0
-
-    return scores
-
-
-def __parallel_fit(estimator, X, y, groups, train_indices, test_indices, sample_weight):
-    """
-    Fit classifiers/regressors in parallel
-
-    Args
-    ----
-    estimator: scikit learn estimator
-    X, y: 2D and 1D numpy arrays of training data and labels
-    groups: 1D numpy array of len(y) containing group labels
-    train_indices, test_indices: 1D numpy arrays of indices to use for training/validation
-    sample_weight: 1D numpy array of len(y) containing weights to use during fitting
-                    applied only to XGBoost and Gradient Boosting classifiers
-    """
-
-    # create training and test folds
-    X_train, X_test = X[train_indices], X[test_indices]
-    y_train, y_test = y[train_indices], y[test_indices]
-    if groups is not None: groups_train = groups[train_indices]
-    else: groups_train = None
-
-    # subset training and test fold sample_weight
-    if sample_weight is not None: weights = sample_weight[train_indices]
-
-    # train estimator
-    if groups is not None and type(estimator).__name__ in ['RandomizedSearchCV', 'GridSearchCV']:
-        if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
-        else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
-    else:
-        if sample_weight is None: estimator.fit(X_train, y_train)
-        else: estimator.fit(X_train, y_train, sample_weight=weights)
-
-    return estimator
-
-
-def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
-                     scoring='accuracy', feature_importances=False,
-                     n_permutations=25, random_state=None, n_jobs=-1):
-    """
-    Stratified KFold and GroupKFold cross-validation using multiple
-    scoring metrics and permutation feature importances
-
-    Args
-    ----
-    estimator: Scikit learn estimator
-    X, y: 2D and 1D numpy array of training data and labels
-    groups: 1D numpy array containing group labels
-    sample_weight: 1D numpy array[n_samples,] of sample weights
-    cv: Integer of cross-validation folds or sklearn.model_selection object
-    scoring: List of performance metrics to use
-    feature_importances: Boolean to perform permutation-based importances
-    n_permutations: Number of permutations during feature importance
-    random_state: Seed to pass to the random number generator
-
-    Returns
-    -------
-    scores: Dict, containing lists of scores per cross-validation fold
-    byclass_scores: Dict, containing scores per class
-    fimp: 2D numpy array of permutation feature importances per feature
-    clf_resamples: List, fitted estimators
-    predictions: 2D numpy array with y_true, y_pred, fold
-    """
-
-    from sklearn import metrics
-    from sklearn.model_selection import StratifiedKFold
-    from sklearn.externals.joblib import Parallel, delayed
-
-    # -------------------------------------------------------------------------
-    # create copies of estimator and create cross-validation iterator
-    # -------------------------------------------------------------------------
-
-    # deepcopy estimator
-    clf = deepcopy(estimator)
-
-    # create model_selection method
-    if isinstance(cv, int):
-        cv = StratifiedKFold(n_splits=cv)
-
-    # -------------------------------------------------------------------------
-    # create dictionary of lists to store metrics
-    # -------------------------------------------------------------------------
-
-    if isinstance(scoring, basestring):
-        scoring = [scoring]
-    scores = dict.fromkeys(scoring)
-    scores = {key: [] for key, value in scores.iteritems()}
-    scoring_methods = {'accuracy': metrics.accuracy_score,
-                       'balanced_accuracy': metrics.recall_score,
-                       'average_precision': metrics.average_precision_score,
-                       'brier_loss': metrics.brier_score_loss,
-                       'kappa': metrics.cohen_kappa_score,
-                       'f1': metrics.f1_score,
-                       'fbeta': metrics.fbeta_score,
-                       'hamming_loss': metrics.hamming_loss,
-                       'jaccard_similarity': metrics.jaccard_similarity_score,
-                       'log_loss': metrics.log_loss,
-                       'matthews_corrcoef': metrics.matthews_corrcoef,
-                       'precision': metrics.precision_score,
-                       'recall': metrics.recall_score,
-                       'specificity': specificity_score,
-                       'roc_auc': metrics.roc_auc_score,
-                       'zero_one_loss': metrics.zero_one_loss,
-                       'r2': metrics.r2_score,
-                       'neg_mean_squared_error': metrics.mean_squared_error}
-
-    byclass_methods = {'f1': metrics.f1_score,
-                       'fbeta': metrics.fbeta_score,
-                       'precision': metrics.precision_score,
-                       'recall': metrics.recall_score}
-
-    # create dict to store byclass metrics results
-    n_classes = len(np.unique(y))
-    labels = np.unique(y)
-    byclass_scores = dict.fromkeys(byclass_methods)
-    byclass_scores = {key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
-
-    # remove any byclass_scorers that are not in the scoring list
-    byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
-
-    # check if remaining scorers are valid sklearn metrics
-    for i in scores.keys():
-        try:
-            list(scoring_methods.keys()).index(i)
-        except:
-            gscript.fatal('Scoring ' + i + ' is not a valid scoring method' +
-                          os.linesep + 'Valid methods are: ' +
-                          ', '.join(scoring_methods.keys()))
-
-    # set averaging type for global binary or multiclass scores
-    if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
-        average = 'binary'
-    else:
-        average = 'macro'
-
-    # create np array to store feature importance scores
-    if feature_importances is True:
-        fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
-        fimp[:] = np.nan
-    else:
-        fimp = None
-
-    # -------------------------------------------------------------------------
-    # extract cross-validation indices
-    # -------------------------------------------------------------------------
-
-    if groups is None:
-        k_fold = cv.split(X, y)
-    else:
-        k_fold = cv.split(X, y, groups=groups)
-
-    trains, tests = [], []
-    for train_indices, test_indices in k_fold:
-        trains.append(train_indices)
-        tests.append(test_indices)
-
-    # -------------------------------------------------------------------------
-    # Perform multiprocessing fitting of clf on each fold
-    # -------------------------------------------------------------------------
-
-    clf_resamples = Parallel(n_jobs=n_jobs)(
-        delayed(__parallel_fit)(clf, X, y, groups, train_indices,
-                              test_indices, sample_weight)
-        for train_indices, test_indices in zip(trains, tests))
-
-    # -------------------------------------------------------------------------
-    # loop through each fold and calculate performance metrics
-    # -------------------------------------------------------------------------
-
-    # store predictions and indices
-    predictions = np.zeros((len(y), 3)) # y_true, y_pred, fold
-
-    fold = 0
-    for train_indices, test_indices in zip(trains, tests):
-
-        # create training and test folds
-        X_train, X_test = X[train_indices], X[test_indices]
-        y_train, y_test = y[train_indices], y[test_indices]
-
-        # prediction of test fold
-        y_pred = clf_resamples[fold].predict(X_test)
-        predictions[test_indices, 0] = y_test
-        predictions[test_indices, 1] = y_pred
-        predictions[test_indices, 2] = fold
-
-        # calculate global performance metrics
-        for m in scores.keys():
-            # metrics that require probabilities
-            if m == 'brier_loss' or m == 'roc_auc':
-                y_prob = clf_resamples[fold].predict_proba(X_test)[:, 1]
-                scores[m] = np.append(
-                    scores[m], scoring_methods[m](y_test, y_prob))
-
-            # metrics that have no averaging for multiclass
-            elif m in ('kappa', 'specificity', 'accuracy', 'hamming_loss',
-                       'jaccard_similarity', 'log_loss', 'zero_one_loss',
-                       'matthews_corrcoef', 'r2', 'neg_mean_squared_error'):
-                scores[m] = np.append(
-                    scores[m], scoring_methods[m](y_test, y_pred))
-
-            # balanced accuracy
-            elif m == 'balanced_accuracy':
-                scores[m] = np.append(
-                    scores[m], scoring_methods[m](
-                        y_test, y_pred, average='macro'))
-
-            # metrics that have averaging for multiclass
-            else:
-                scores[m] = np.append(
-                    scores[m], scoring_methods[m](
-                        y_test, y_pred, average=average))
-
-        # calculate per-class performance metrics
-        for key in byclass_scores.keys():
-            byclass_scores[key] = np.vstack((
-                byclass_scores[key], byclass_methods[key](
-                    y_test, y_pred, labels=labels, average=None)))
-
-        # feature importances using permutation
-        if feature_importances is True:
-            fimp[fold, :] = varimp_permutation(
-                clf_resamples[fold], X_test, y_test, n_permutations,
-                scoring_methods[scoring[0]], n_jobs, random_state)
-        fold += 1
-
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The module rlearn_crossval contains functions to perform
+model validation and permutation feature importances.
+
+"""
+
+from __future__ import absolute_import
+from copy import deepcopy
+
+import numpy as np
+import os
+from numpy.random import RandomState
+
+import grass.script as gscript
+
+def specificity_score(y_true, y_pred):
+    """
+    Calculate specificity score
+
+    Args
+    ----
+    y_true: 1D numpy array of truth values
+    y_pred: 1D numpy array of predicted classes
+
+    Returns
+    -------
+    specificity: specificity score
+    """
+    from sklearn.metrics import confusion_matrix
+
+    cm = confusion_matrix(y_true, y_pred)
+    tn = float(cm[0][0])
+    fp = float(cm[0][1])
+
+    return tn/(tn+fp)
+
+
+def varimp_permutation(estimator, X, y, n_permutations, scorer,
+                       n_jobs, random_state):
+    """
+    Method to perform permutation-based feature importance during
+    cross-validation (cross-validation is applied externally to this
+    method)
+
+    Procedure is:
+    1. Pass fitted estimator and test partition X y
+    2. Assess the score on the test partition (best_score)
+    3. Permute each variable and assess the difference between best_score
+       and the score obtained with the permuted variable
+    4. Repeat (3) for many random permutations
+    5. Average the repeats
+
+    Args
+    ----
+    estimator: estimator that has been fitted to a training partition
+    X, y: data and labels from a test partition
+    n_permutations: number of random permutations to apply
+    scorer: scikit-learn metric function to use
+    n_jobs: integer, number of processing cores
+    random_state: seed to pass to the numpy random.seed
+
+    Returns
+    -------
+    scores: scores for each predictor following permutation
+    """
+
+    from sklearn.externals.joblib import Parallel, delayed
+
+    # calculate score on original variables without permutation
+    # determine best metric type for binary/multiclass/regression scenarios
+    y_pred = estimator.predict(X)
+    best_score = scorer(y, y_pred)
+
+    # repeated permutations and return difference from best score per predictor
+    scores = Parallel(n_jobs=n_jobs)(
+        delayed(__permute)(
+            estimator, X, y, best_score, scorer, random_state)
+        for n in range(n_permutations))
+
+    # average the repetitions
+    scores = np.asarray(scores)
+    scores = scores.mean(axis=0)
+
+    return scores
+
+
+def __permute(estimator, X, y, best_score, scorer, random_state):
+    """
+    Permute each predictor and measure difference from best score
+
+    Args
+    ----
+    estimator: scikit learn estimator
+    X, y: data and labels from a test partition
+    best_score: best scorer obtained on unperturbed data
+    scorer: scoring method to use to measure importances
+    random_state: random seed
+
+    Returns
+    -------
+    scores: 2D numpy array of scores for each predictor following permutation
+    """
+
+    rstate = RandomState(random_state)
+
+    # permute each predictor variable and assess difference in score
+    scores = np.zeros(X.shape[1])
+
+    for i in range(X.shape[1]):
+        Xscram = np.copy(X)
+        Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])
+
+        # fit the model on the training data and predict the test data
+        y_pred = estimator.predict(Xscram)
+        scores[i] = best_score-scorer(y, y_pred)
+        if scores[i] < 0:
+            scores[i] = 0
+
+    return scores
+
+
+def __parallel_fit(estimator, X, y, groups, train_indices, test_indices, sample_weight):
+    """
+    Fit classifiers/regressors in parallel
+
+    Args
+    ----
+    estimator: scikit learn estimator
+    X, y: 2D and 1D numpy arrays of training data and labels
+    groups: 1D numpy array of len(y) containing group labels
+    train_indices, test_indices: 1D numpy arrays of indices to use for training/validation
+    sample_weight: 1D numpy array of len(y) containing weights to use during fitting
+                    applied only to XGBoost and Gradient Boosting classifiers
+    """
+
+    # create training and test folds
+    X_train, X_test = X[train_indices], X[test_indices]
+    y_train, y_test = y[train_indices], y[test_indices]
+    if groups is not None: groups_train = groups[train_indices]
+    else: groups_train = None
+
+    # subset training and test fold sample_weight
+    if sample_weight is not None: weights = sample_weight[train_indices]
+
+    # train estimator
+    if groups is not None and type(estimator).__name__ in ['RandomizedSearchCV', 'GridSearchCV']:
+        if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
+        else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
+    else:
+        if sample_weight is None: estimator.fit(X_train, y_train)
+        else: estimator.fit(X_train, y_train, sample_weight=weights)
+
+    return estimator
+
+
+def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
+                     scoring='accuracy', feature_importances=False,
+                     n_permutations=25, random_state=None, n_jobs=-1):
+    """
+    Stratified KFold and GroupKFold cross-validation using multiple
+    scoring metrics and permutation feature importances
+
+    Args
+    ----
+    estimator: Scikit learn estimator
+    X, y: 2D and 1D numpy array of training data and labels
+    groups: 1D numpy array containing group labels
+    sample_weight: 1D numpy array[n_samples,] of sample weights
+    cv: Integer of cross-validation folds or sklearn.model_selection object
+    scoring: List of performance metrics to use
+    feature_importances: Boolean to perform permutation-based importances
+    n_permutations: Number of permutations during feature importance
+    random_state: Seed to pass to the random number generator
+
+    Returns
+    -------
+    scores: Dict, containing lists of scores per cross-validation fold
+    byclass_scores: Dict, containing scores per class
+    fimp: 2D numpy array of permutation feature importances per feature
+    clf_resamples: List, fitted estimators
+    predictions: 2D numpy array with y_true, y_pred, fold
+    """
+
+    from sklearn import metrics
+    from sklearn.model_selection import StratifiedKFold
+    from sklearn.externals.joblib import Parallel, delayed
+
+    # -------------------------------------------------------------------------
+    # create copies of estimator and create cross-validation iterator
+    # -------------------------------------------------------------------------
+
+    # deepcopy estimator
+    clf = deepcopy(estimator)
+
+    # create model_selection method
+    if isinstance(cv, int):
+        cv = StratifiedKFold(n_splits=cv)
+
+    # -------------------------------------------------------------------------
+    # create dictionary of lists to store metrics
+    # -------------------------------------------------------------------------
+
+    if isinstance(scoring, basestring):
+        scoring = [scoring]
+    scores = dict.fromkeys(scoring)
+    scores = {key: [] for key, value in scores.iteritems()}
+    scoring_methods = {'accuracy': metrics.accuracy_score,
+                       'balanced_accuracy': metrics.recall_score,
+                       'average_precision': metrics.average_precision_score,
+                       'brier_loss': metrics.brier_score_loss,
+                       'kappa': metrics.cohen_kappa_score,
+                       'f1': metrics.f1_score,
+                       'fbeta': metrics.fbeta_score,
+                       'hamming_loss': metrics.hamming_loss,
+                       'jaccard_similarity': metrics.jaccard_similarity_score,
+                       'log_loss': metrics.log_loss,
+                       'matthews_corrcoef': metrics.matthews_corrcoef,
+                       'precision': metrics.precision_score,
+                       'recall': metrics.recall_score,
+                       'specificity': specificity_score,
+                       'roc_auc': metrics.roc_auc_score,
+                       'zero_one_loss': metrics.zero_one_loss,
+                       'r2': metrics.r2_score,
+                       'neg_mean_squared_error': metrics.mean_squared_error}
+
+    byclass_methods = {'f1': metrics.f1_score,
+                       'fbeta': metrics.fbeta_score,
+                       'precision': metrics.precision_score,
+                       'recall': metrics.recall_score}
+
+    # create dict to store byclass metrics results
+    n_classes = len(np.unique(y))
+    labels = np.unique(y)
+    byclass_scores = dict.fromkeys(byclass_methods)
+    byclass_scores = {key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
+
+    # remove any byclass_scorers that are not in the scoring list
+    byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
+
+    # check if remaining scorers are valid sklearn metrics
+    for i in scores.keys():
+        try:
+            list(scoring_methods.keys()).index(i)
+        except:
+            gscript.fatal('Scoring ' + i + ' is not a valid scoring method' +
+                          os.linesep + 'Valid methods are: ' +
+                          ', '.join(scoring_methods.keys()))
+
+    # set averaging type for global binary or multiclass scores
+    if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
+        average = 'binary'
+    else:
+        average = 'macro'
+
+    # create np array to store feature importance scores
+    if feature_importances is True:
+        fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
+        fimp[:] = np.nan
+    else:
+        fimp = None
+
+    # -------------------------------------------------------------------------
+    # extract cross-validation indices
+    # -------------------------------------------------------------------------
+
+    if groups is None:
+        k_fold = cv.split(X, y)
+    else:
+        k_fold = cv.split(X, y, groups=groups)
+
+    trains, tests = [], []
+    for train_indices, test_indices in k_fold:
+        trains.append(train_indices)
+        tests.append(test_indices)
+
+    # -------------------------------------------------------------------------
+    # Perform multiprocessing fitting of clf on each fold
+    # -------------------------------------------------------------------------
+
+    clf_resamples = Parallel(n_jobs=n_jobs)(
+        delayed(__parallel_fit)(clf, X, y, groups, train_indices,
+                              test_indices, sample_weight)
+        for train_indices, test_indices in zip(trains, tests))
+
+    # -------------------------------------------------------------------------
+    # loop through each fold and calculate performance metrics
+    # -------------------------------------------------------------------------
+
+    # store predictions and indices
+    predictions = np.zeros((len(y), 3)) # y_true, y_pred, fold
+
+    fold = 0
+    for train_indices, test_indices in zip(trains, tests):
+
+        # create training and test folds
+        X_train, X_test = X[train_indices], X[test_indices]
+        y_train, y_test = y[train_indices], y[test_indices]
+
+        # prediction of test fold
+        y_pred = clf_resamples[fold].predict(X_test)
+        predictions[test_indices, 0] = y_test
+        predictions[test_indices, 1] = y_pred
+        predictions[test_indices, 2] = fold
+
+        # calculate global performance metrics
+        for m in scores.keys():
+            # metrics that require probabilities
+            if m == 'brier_loss' or m == 'roc_auc':
+                y_prob = clf_resamples[fold].predict_proba(X_test)[:, 1]
+                scores[m] = np.append(
+                    scores[m], scoring_methods[m](y_test, y_prob))
+
+            # metrics that have no averaging for multiclass
+            elif m in ('kappa', 'specificity', 'accuracy', 'hamming_loss',
+                       'jaccard_similarity', 'log_loss', 'zero_one_loss',
+                       'matthews_corrcoef', 'r2', 'neg_mean_squared_error'):
+                scores[m] = np.append(
+                    scores[m], scoring_methods[m](y_test, y_pred))
+
+            # balanced accuracy
+            elif m == 'balanced_accuracy':
+                scores[m] = np.append(
+                    scores[m], scoring_methods[m](
+                        y_test, y_pred, average='macro'))
+
+            # metrics that have averaging for multiclass
+            else:
+                scores[m] = np.append(
+                    scores[m], scoring_methods[m](
+                        y_test, y_pred, average=average))
+
+        # calculate per-class performance metrics
+        for key in byclass_scores.keys():
+            byclass_scores[key] = np.vstack((
+                byclass_scores[key], byclass_methods[key](
+                    y_test, y_pred, labels=labels, average=None)))
+
+        # feature importances using permutation
+        if feature_importances is True:
+            fimp[fold, :] = varimp_permutation(
+                clf_resamples[fold], X_test, y_test, n_permutations,
+                scoring_methods[scoring[0]], n_jobs, random_state)
+        fold += 1
+
     return(scores, byclass_scores, fimp, clf_resamples, predictions)
\ No newline at end of file
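
For reference, a usage sketch of cross_val_scores as defined above. The
synthetic data and the RandomForestClassifier are placeholders, and note
that the module as written targets Python 2 (basestring, iteritems):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from rlearn_crossval import cross_val_scores

    X, y = make_classification(n_samples=300, random_state=0)
    clf = RandomForestClassifier(n_estimators=50, random_state=0)

    scores, byclass, fimp, fitted, preds = cross_val_scores(
        clf, X, y, cv=3, scoring=['accuracy', 'kappa'],
        feature_importances=True, n_permutations=10, random_state=0)

    # scores['kappa'] holds one value per fold; fimp is an
    # (n_folds, n_features) array of permutation importances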

Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py	2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py	2017-06-01 22:03:12 UTC (rev 71157)
@@ -1,479 +1,486 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-The module rlearn_rasters contains functions to
-extract training data from GRASS rasters.
-
-"""
-
-
-from __future__ import absolute_import
-
-import numpy as np
-import tempfile
-
-import grass.script as gscript
-from grass.pygrass.raster import RasterRow
-from grass.pygrass.gis.region import Region
-from grass.pygrass.raster.buffer import Buffer
-from grass.pygrass.vector import VectorTopo
-from grass.pygrass.utils import get_raster_for_points, pixel2coor
-
-
-def extract(response, predictors, lowmem=False):
-    """
-    Samples a list of GRASS rasters using a labelled raster;
-    sampling is performed one raster at a time
-
-    Args
-    ----
-    response: String; GRASS raster with labelled pixels
-    predictors: List of GRASS rasters containing explanatory variables
-    lowmem: Boolean, use numpy memmap to query predictors
-
-    Returns
-    -------
-    training_data: Numpy array of extracted raster values
-    training_labels: Numpy array of labels
-    is_train: Row and Columns of label positions
-    """
-
-    current = Region()
-
-    # open response raster as rasterrow and read as np array
-    if RasterRow(response).exist() is True:
-        roi_gr = RasterRow(response)
-        roi_gr.open('r')
-
-        if lowmem is False:
-            response_np = np.array(roi_gr)
-        else:
-            response_np = np.memmap(
-                tempfile.NamedTemporaryFile(),
-                dtype='float32', mode='w+',
-                shape=(current.rows, current.cols))
-            response_np[:] = np.array(roi_gr)[:]
-    else:
-        gscript.fatal("GRASS response raster does not exist.... exiting")
-
-    # determine number of predictor rasters
-    n_features = len(predictors)
-
-    # check to see if all predictors exist
-    for i in range(n_features):
-        if RasterRow(predictors[i]).exist() is not True:
-            gscript.fatal("GRASS raster " + predictors[i] +
-                          " does not exist.... exiting")
-
-    # check if any of those pixels are labelled (not equal to nodata)
-    # can use even if roi is FCELL because nodata will be nan
-    is_train = np.nonzero(response_np > -2147483648)
-    training_labels = response_np[is_train]
-    n_labels = np.array(is_train).shape[1]
-
-    # Create a zero numpy array of len training labels
-    if lowmem is False:
-        training_data = np.zeros((n_labels, n_features))
-    else:
-        training_data = np.memmap(tempfile.NamedTemporaryFile(),
-                                  dtype='float32', mode='w+',
-                                  shape=(n_labels, n_features))
-
-    # Loop through each raster and sample pixel values at training indexes
-    if lowmem is True:
-        feature_np = np.memmap(tempfile.NamedTemporaryFile(),
-                               dtype='float32', mode='w+',
-                               shape=(current.rows, current.cols))
-
-    for f in range(n_features):
-        predictor_gr = RasterRow(predictors[f])
-        predictor_gr.open('r')
-
-        if lowmem is False:
-            feature_np = np.array(predictor_gr)
-        else:
-            feature_np[:] = np.array(predictor_gr)[:]
-
-        training_data[0:n_labels, f] = feature_np[is_train]
-
-        # close each predictor map
-        predictor_gr.close()
-
-    # convert any CELL map nodata values to NaN in the training data
-    for i in range(n_features):
-        training_data[training_data[:, i] == -2147483648] = np.nan
-
-    # convert indexes of training pixels from tuple to n*2 np array
-    is_train = np.array(is_train).T
-    for i in range(is_train.shape[0]):
-        is_train[i, :] = np.array(pixel2coor(tuple(is_train[i]), current))
-
-    # close the response map
-    roi_gr.close()
-
-    return(training_data, training_labels, is_train)
-
-
-def extract_points(gvector, grasters, field):
-    """
-    Extract values from grass rasters using vector points input
-
-    Args
-    ----
-    gvector: character, name of grass points vector
-    grasters: list of names of grass raster to query
-    field: character, name of field in table to use as response variable
-
-    Returns
-    -------
-    X: 2D numpy array of training data
-    y: 1D numpy array with the response variable
-    coordinates: 2D numpy array of sample coordinates
-    """
-    # open grass vector
-    points = VectorTopo(gvector.split('@')[0])
-    points.open('r')
-
-    # create link to attribute table
-    points.dblinks.by_name(name=gvector)
-
-    # extract table field to numpy array
-    table = points.table
-    cur = table.execute("SELECT {field} FROM {name}".format(field=field, name=table.name))
-    y = np.array([np.nan if c is None else c[0] for c in cur])
-
-    # extract raster data
-    X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
-    for i, raster in enumerate(grasters):
-        rio = RasterRow(raster)
-        values = np.asarray(get_raster_for_points(points, rio))
-        coordinates = values[:, 1:3]
-        X[:, i] = values[:, 3]
-
-    # set any grass integer nodata values to NaN
-    X[X == -2147483648] = np.nan
-
-    # remove missing response data
-    X = X[~np.isnan(y)]
-    coordinates = coordinates[~np.isnan(y)]
-    y = y[~np.isnan(y)]
-
-    # close
-    points.close()
-
-    return(X, y, coordinates)
-
-
-def predict(estimator, predictors, output, predict_type='raw',
-            index=None, rowincr=25, n_jobs=-2):
-    """
-    Prediction on list of GRASS rasters using a fitted scikit learn model
-
-    Args
-    ----
-    estimator: scikit-learn estimator object
-    predictors: list of GRASS rasters
-    output: Name of GRASS raster to output classification results
-    predict_type: character, 'raw' for classification/regression;
-                  'prob' for class probabilities
-    index: Optional, list of class indices to export
-    rowincr: Integer of raster rows to process at one time
-    n_jobs: Number of processing cores
-    """
-
-    from sklearn.externals.joblib import Parallel, delayed
-
-    # convert potential single index to list
-    if isinstance(index, int): index = [index]
-
-    # open predictors as list of rasterrow objects
-    current = Region()
-    n_features = len(predictors)
-    rasstack = [0] * n_features
-
-    for i in range(n_features):
-        rasstack[i] = RasterRow(predictors[i])
-        if rasstack[i].exist() is True:
-            rasstack[i].open('r')
-        else:
-            gscript.fatal("GRASS raster " + predictors[i] +
-                          " does not exist.... exiting")
-
-    # -------------------------------------------------------------------------
-    # turn off multiprocessing for multi-threaded classifiers
-    # -------------------------------------------------------------------------
-
-    # first unwrap the estimator from any potential pipelines or gridsearchCV
-    if type(estimator).__name__ == 'Pipeline':
-        clf_type = estimator.named_steps['classifier']
-    else:
-        clf_type = estimator
-
-    if type(clf_type).__name__ == 'GridSearchCV' or \
-        type(clf_type).__name__ == 'RandomizedSearchCV':
-        clf_type = clf_type.best_estimator_
-
-    # check name against already multithreaded classifiers
-    if n_jobs == 1 or type(clf_type).__name__ in ['RandomForestClassifier',
-                                                  'RandomForestRegressor',
-                                                  'ExtraTreesClassifier',
-                                                  'ExtraTreesRegressor',
-                                                  'KNeighborsClassifier',
-                                                  'XGBClassifier',
-                                                  'XGBRegressor']:
-        # ---------------------------------------------------------------------
-        # sequential prediction
-        # ---------------------------------------------------------------------
-
-        # Prediction using blocks of rows per iteration
-        for rowblock in range(0, current.rows, rowincr):
-            gscript.percent(rowblock, current.rows, rowincr)
-
-            # check that the row increment does not exceed the number of rows
-            if rowblock+rowincr > current.rows:
-                rowincr = current.rows - rowblock
-            img_np_row = np.zeros((rowincr, current.cols, n_features))
-
-            # loop through each row, and each band and add to 2D img_np_row
-            for row in range(rowblock, rowblock+rowincr, 1):
-                for band in range(n_features):
-                    img_np_row[row-rowblock, :, band] = \
-                        np.array(rasstack[band][row])
-
-            # create mask
-            img_np_row[img_np_row == -2147483648] = np.nan
-            mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
-            for feature in range(n_features):
-                invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
-                mask[invalid_indexes] = np.nan
-
-            # reshape each row-band matrix into a n*m array
-            nsamples = rowincr * current.cols
-            flat_pixels = img_np_row.reshape((nsamples, n_features))
-
-            # remove NaNs prior to passing to scikit-learn predict
-            flat_pixels = np.nan_to_num(flat_pixels)
-
-            # perform prediction for classification/regression
-            if predict_type == 'raw':
-                result = estimator.predict(flat_pixels)
-                result = result.reshape((rowincr, current.cols))
-
-                # determine nodata value and grass raster type
-                if result.dtype == 'float':
-                    nodata = np.nan
-                    ftype = 'FCELL'
-                else:
-                    nodata = -2147483648
-                    ftype = 'CELL'
-
-                # replace NaN values so that the prediction does not have a border
-                result[np.nonzero(np.isnan(mask))] = nodata
-
-                # on first iteration create the RasterRow object
-                if rowblock == 0:
-                    if predict_type == 'raw':
-                        classification = RasterRow(output)
-                        classification.open('w', ftype, overwrite=True)
-
-                # write the classification result
-                for row in range(rowincr):
-                    newrow = Buffer((result.shape[1],), mtype=ftype)
-                    newrow[:] = result[row, :]
-                    classification.put_row(newrow)
-
-            # perform prediction for class probabilities
-            if predict_type == 'prob':
-                result_proba = estimator.predict_proba(flat_pixels)
-
-                # on first loop determine number of probability classes
-                # and open rasterrow objects for writing
-                if rowblock == 0:
-                    if index is None:
-                        index = range(result_proba.shape[1])
-                        n_classes = len(index)
-                    else:
-                        n_classes = len(np.unique(index))
-
-                    # create and open RasterRow objects for probabilities
-                    prob_out_raster = [0] * n_classes
-                    prob = [0] * n_classes
-                    for iclass, label in enumerate(index):
-                        prob_out_raster[iclass] = output + '_classPr' + str(label)
-                        prob[iclass] = RasterRow(prob_out_raster[iclass])
-                        prob[iclass].open('w', 'FCELL', overwrite=True)
-
-                for iclass, label in enumerate(index):
-                    result_proba_class = result_proba[:, label]
-                    result_proba_class = result_proba_class.reshape((rowincr, current.cols))
-                    result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
-
-                    for row in range(rowincr):
-                        newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
-                        newrow[:] = result_proba_class[row, :]
-                        prob[iclass].put_row(newrow)
-    else:
-
-        # ---------------------------------------------------------------------
-        # parallel prediction
-        # ---------------------------------------------------------------------
-
-        # create lists of row increments
-        row_mins, row_maxs = [], []
-        for row in range(0, current.rows, rowincr):
-            if row+rowincr > current.rows:
-                rowincr = current.rows - row
-            row_mins.append(row)
-            row_maxs.append(row+rowincr)
-
-        # perform predictions on lists of row increments in parallel
-        predictions = Parallel(n_jobs=n_jobs, max_nbytes=None)(
-            delayed(__predict_parallel)
-            (estimator, predictors, predict_type, current, row_min, row_max)
-            for row_min, row_max in zip(row_mins, row_maxs))
-
-        # unpack the results
-        results = []
-        ftypes = []
-        for block in predictions:
-            results.append(block[0])
-            ftypes.append(block[1])
-
-        # -------------------------------------------------------------------------
-        #  writing of predicted results for classification
-        # -------------------------------------------------------------------------
-        if predict_type == 'raw':
-            classification = RasterRow(output)
-            classification.open('w', ftypes[0], overwrite=True)
-
-            # write the classification result
-            for result_block in results:
-                for row in range(result_block.shape[0]):
-                    newrow = Buffer((result_block.shape[1],), mtype=ftypes[0])
-                    newrow[:] = result_block[row, :]
-                    classification.put_row(newrow)
-
-        # -------------------------------------------------------------------------
-        # writing of predicted results for probabilities
-        # -------------------------------------------------------------------------
-        if predict_type == 'prob':
-            # determine number of classes
-            if index is None:
-                index = range(results[0].shape[2])
-                n_classes = len(index)
-            else:
-                n_classes = len(np.unique(index))
-
-            # create and open RasterRow objects for probabilities
-            prob_out_raster = [0] * n_classes
-            prob = [0] * n_classes
-            for iclass, label in enumerate(index):
-                prob_out_raster[iclass] = output + '_classPr' + str(label)
-                prob[iclass] = RasterRow(prob_out_raster[iclass])
-                prob[iclass].open('w', 'FCELL', overwrite=True)
-
-            # write the class probability results
-            for results_proba_block in results:
-                for iclass, label in enumerate(index):
-                    result_proba_class = results_proba_block[:, :, label]
-
-                    for row in range(result_proba_class.shape[0]):
-                        newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
-                        newrow[:] = result_proba_class[row, :]
-                        prob[iclass].put_row(newrow)
-
-    # -------------------------------------------------------------------------
-    # close all maps
-    # -------------------------------------------------------------------------
-    if predict_type == 'raw': classification.close()
-    if predict_type == 'prob':
-        try:
-            for iclass in range(n_classes):
-                prob[iclass].close()
-        except:
-            pass
-
-
-def __predict_parallel(estimator, predictors, predict_type, current, row_min, row_max):
-    """
-    Performs prediction on range of rows in grass rasters
-
-    Args
-    ----
-    estimator: scikit-learn estimator object
-    predictors: list of GRASS rasters
-    predict_type: character, 'raw' for classification/regression;
-                  'prob' for class probabilities
-    current: current region settings
-    row_min, row_max: Range of rows of grass rasters to perform predictions
-
-    Returns
-    -------
-    result: 2D (classification) or 3D numpy array (class probabilities) of predictions
-    ftypes: data storage type
-    """
-
-    # initialize output
-    result, ftype, mask = None, None, None
-
-    # open grass rasters
-    n_features = len(predictors)
-    rasstack = [0] * n_features
-
-    for i in range(n_features):
-        rasstack[i] = RasterRow(predictors[i])
-        if rasstack[i].exist() is True:
-            rasstack[i].open('r')
-        else:
-            gscript.fatal("GRASS raster " + predictors[i] +
-                          " does not exist.... exiting")
-
-    # loop through each row, and each band and add to 2D img_np_row
-    img_np_row = np.zeros((row_max-row_min, current.cols, n_features))
-    for row in range(row_min, row_max):
-        for band in range(n_features):
-            img_np_row[row-row_min, :, band] = np.array(rasstack[band][row])
-
-    # create mask
-    img_np_row[img_np_row == -2147483648] = np.nan
-    mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
-    for feature in range(n_features):
-        invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
-        mask[invalid_indexes] = np.nan
-
-    # reshape each row-band matrix into a n*m array
-    nsamples = (row_max-row_min) * current.cols
-    flat_pixels = img_np_row.reshape((nsamples, n_features))
-
-    # remove NaNs prior to passing to scikit-learn predict
-    flat_pixels = np.nan_to_num(flat_pixels)
-
-    # perform prediction for classification/regression
-    if predict_type == 'raw':
-        result = estimator.predict(flat_pixels)
-        result = result.reshape((row_max-row_min, current.cols))
-
-        # determine nodata value and grass raster type
-        if result.dtype == 'float':
-            nodata = np.nan
-            ftype = 'FCELL'
-        else:
-            nodata = -2147483648
-            ftype = 'CELL'
-
-        # replace NaN values so that the prediction does not have a border
-        result[np.nonzero(np.isnan(mask))] = nodata
-
-    # perform prediction for class probabilities
-    if predict_type == 'prob':
-        result = estimator.predict_proba(flat_pixels)
-        result = result.reshape((row_max-row_min, current.cols, result.shape[1]))
-        result[np.nonzero(np.isnan(mask))] = np.nan
-
-    # close maps
-    for i in range(n_features):
-        rasstack[i].close()
-
-    return (result, ftype)
-
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The module rlearn_rasters contains functions to
+extract training data from GRASS rasters.
+
+"""
+
+
+from __future__ import absolute_import
+
+import numpy as np
+import tempfile
+
+import grass.script as gscript
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.vector import VectorTopo
+from grass.pygrass.utils import get_raster_for_points, pixel2coor
+
+
+def extract(response, predictors, lowmem=False):
+    """
+    Samples a list of GRASS rasters using a labelled raster;
+    sampling is performed one raster at a time
+
+    Args
+    ----
+    response: String; GRASS raster with labelled pixels
+    predictors: List of GRASS rasters containing explanatory variables
+    lowmem: Boolean, use numpy memmap to query predictors
+
+    Returns
+    -------
+    training_data: Numpy array of extracted raster values
+    training_labels: Numpy array of labels
+    is_train: Row and Columns of label positions
+    """
+
+    current = Region()
+
+    # open response raster as rasterrow and read as np array
+    if RasterRow(response).exist() is True:
+        roi_gr = RasterRow(response)
+        roi_gr.open('r')
+
+        if lowmem is False:
+            response_np = np.array(roi_gr)
+        else:
+            response_np = np.memmap(
+                tempfile.NamedTemporaryFile(),
+                dtype='float32', mode='w+',
+                shape=(current.rows, current.cols))
+            response_np[:] = np.array(roi_gr)[:]
+    else:
+        gscript.fatal("GRASS response raster does not exist.... exiting")
+
+    # determine number of predictor rasters
+    n_features = len(predictors)
+
+    # check to see if all predictors exist
+    for i in range(n_features):
+        if RasterRow(predictors[i]).exist() is not True:
+            gscript.fatal("GRASS raster " + predictors[i] +
+                          " does not exist.... exiting")
+
+    # check if any of those pixels are labelled (not equal to nodata)
+    # can use even if roi is FCELL because nodata will be nan
+    is_train = np.nonzero(response_np > -2147483648)
+    training_labels = response_np[is_train]
+    n_labels = np.array(is_train).shape[1]
+
+    # Create a zero numpy array of len training labels
+    if lowmem is False:
+        training_data = np.zeros((n_labels, n_features))
+    else:
+        training_data = np.memmap(tempfile.NamedTemporaryFile(),
+                                  dtype='float32', mode='w+',
+                                  shape=(n_labels, n_features))
+
+    # Loop through each raster and sample pixel values at training indexes
+    if lowmem is True:
+        feature_np = np.memmap(tempfile.NamedTemporaryFile(),
+                               dtype='float32', mode='w+',
+                               shape=(current.rows, current.cols))
+
+    for f in range(n_features):
+        predictor_gr = RasterRow(predictors[f])
+        predictor_gr.open('r')
+
+        if lowmem is False:
+            feature_np = np.array(predictor_gr)
+        else:
+            feature_np[:] = np.array(predictor_gr)[:]
+
+        training_data[0:n_labels, f] = feature_np[is_train]
+
+        # close each predictor map
+        predictor_gr.close()
+
+    # convert any CELL map nodata values to NaN in the training data
+    for i in range(n_features):
+        training_data[training_data[:, i] == -2147483648] = np.nan
+
+    # convert indexes of training pixels from tuple to n*2 np array
+    is_train = np.array(is_train).T
+    for i in range(is_train.shape[0]):
+        is_train[i, :] = np.array(pixel2coor(tuple(is_train[i]), current))
+
+    # close the response map
+    roi_gr.close()
+
+    return(training_data, training_labels, is_train)
+
+
+def extract_points(gvector, grasters, field):
+    """
+    Extract values from grass rasters using vector points input
+
+    Args
+    ----
+    gvector: character, name of grass points vector
+    grasters: list of names of grass raster to query
+    field: character, name of field in table to use as response variable
+
+    Returns
+    -------
+    X: 2D numpy array of training data
+    y: 1D numpy array with the response variable
+    coordinates: 2D numpy array of sample coordinates
+    """
+    # open grass vector
+    points = VectorTopo(gvector.split('@')[0])
+    points.open('r')
+
+    # create link to attribute table
+    points.dblinks.by_name(name=gvector)
+
+    # extract table field to numpy array
+    table = points.table
+    cur = table.execute("SELECT {field} FROM {name}".format(field=field, name=table.name))
+    y = np.array([np.nan if c is None else c[0] for c in cur])
+
+    # extract raster data
+    X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
+    for i, raster in enumerate(grasters):
+        rio = RasterRow(raster)
+        values = np.asarray(get_raster_for_points(points, rio))
+        coordinates = values[:, 1:3]
+        X[:, i] = values[:, 3]
+        rio.close()
+
+    # set any grass integer nodata values to NaN
+    X[X == -2147483648] = np.nan
+
+    # remove missing response data
+    X = X[~np.isnan(y)]
+    coordinates = coordinates[~np.isnan(y)]
+    y = y[~np.isnan(y)]
+
+    # close
+    points.close()
+
+    return(X, y, coordinates)
+
+
+def predict(estimator, predictors, output, predict_type='raw',
+            index=None, rowincr=25, n_jobs=-2):
+    """
+    Prediction on list of GRASS rasters using a fitted scikit learn model
+
+    Args
+    ----
+    estimator: scikit-learn estimator object
+    predictors: list of GRASS rasters
+    output: Name of GRASS raster to output classification results
+    predict_type: character, 'raw' for classification/regression;
+                  'prob' for class probabilities
+    index: Optional, list of class indices to export
+    rowincr: Integer of raster rows to process at one time
+    n_jobs: Number of processing cores
+    """
+
+    from sklearn.externals.joblib import Parallel, delayed
+
+    # convert potential single index to list
+    if isinstance(index, int): index = [index]
+
+    # open predictors as list of rasterrow objects
+    current = Region()
+    n_features = len(predictors)
+
+    # -------------------------------------------------------------------------
+    # turn off multiprocessing for multi-threaded classifiers
+    # -------------------------------------------------------------------------
+
+    # first unwrap the estimator from any potential pipelines or gridsearchCV
+    if type(estimator).__name__ == 'Pipeline':
+        clf_type = estimator.named_steps['classifier']
+    else:
+        clf_type = estimator
+
+    if type(clf_type).__name__ == 'GridSearchCV' or \
+        type(clf_type).__name__ == 'RandomizedSearchCV':
+        clf_type = clf_type.best_estimator_
+
+    # check name against already multithreaded classifiers
+    if n_jobs == 1 or type(clf_type).__name__ in ['RandomForestClassifier',
+                                                  'RandomForestRegressor',
+                                                  'ExtraTreesClassifier',
+                                                  'ExtraTreesRegressor',
+                                                  'KNeighborsClassifier',
+                                                  'XGBClassifier',
+                                                  'XGBRegressor']:
+        # ---------------------------------------------------------------------
+        # sequential prediction
+        # ---------------------------------------------------------------------
+
+        rasstack = [0] * n_features
+
+        for i in range(n_features):
+            rasstack[i] = RasterRow(predictors[i])
+            if rasstack[i].exist() is True:
+                rasstack[i].open('r')
+            else:
+                gscript.fatal("GRASS raster " + predictors[i] +
+                              " does not exist... exiting")
+
+        # Prediction using blocks of rows per iteration
+        for rowblock in range(0, current.rows, rowincr):
+            gscript.percent(rowblock, current.rows, rowincr)
+
+            # check that the row increment does not exceed the number of rows
+            if rowblock+rowincr > current.rows:
+                rowincr = current.rows - rowblock
+            img_np_row = np.zeros((rowincr, current.cols, n_features))
+
+            # loop through each row, and each band and add to 2D img_np_row
+            for row in range(rowblock, rowblock+rowincr, 1):
+                for band in range(n_features):
+                    img_np_row[row-rowblock, :, band] = \
+                        np.array(rasstack[band][row])
+
+            # build a NaN mask flagging cells with nodata in any band
+            img_np_row[img_np_row == -2147483648] = np.nan
+            mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+            for feature in range(n_features):
+                invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+                mask[invalid_indexes] = np.nan
+
+            # reshape each row-band matrix into a n*m array
+            nsamples = rowincr * current.cols
+            flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+            # remove NaNs prior to passing to scikit-learn predict
+            flat_pixels = np.nan_to_num(flat_pixels)
+
+            # perform prediction for classification/regression
+            if predict_type == 'raw':
+                result = estimator.predict(flat_pixels)
+                result = result.reshape((rowincr, current.cols))
+
+                # determine nodata value and grass raster type
+                if result.dtype == 'float':
+                    nodata = np.nan
+                    ftype = 'FCELL'
+                else:
+                    nodata = -2147483648
+                    ftype = 'CELL'
+
+                # mask cells that contained nodata in any predictor so that
+                # the prediction does not extend beyond the valid data area
+                result[np.nonzero(np.isnan(mask))] = nodata
+
+                # on the first iteration create the RasterRow object
+                if rowblock == 0:
+                    classification = RasterRow(output)
+                    classification.open('w', ftype, overwrite=True)
+
+                # write the classification result
+                for row in range(rowincr):
+                    newrow = Buffer((result.shape[1],), mtype=ftype)
+                    newrow[:] = result[row, :]
+                    classification.put_row(newrow)
+
+            # perform prediction for class probabilities
+            if predict_type == 'prob':
+                result_proba = estimator.predict_proba(flat_pixels)
+
+                # on first loop determine number of probability classes
+                # and open rasterrow objects for writing
+                if rowblock == 0:
+                    if index is None:
+                        index = range(result_proba.shape[1])
+                        n_classes = len(index)
+                    else:
+                        n_classes = len(np.unique(index))
+
+                    # create and open RasterRow objects for probabilities
+                    prob_out_raster = [0] * n_classes
+                    prob = [0] * n_classes
+                    for iclass, label in enumerate(index):
+                        prob_out_raster[iclass] = output + '_classPr' + str(label)
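+                        # e.g. output='landclass', label=1 -> 'landclass_classPr1'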
+                        prob[iclass] = RasterRow(prob_out_raster[iclass])
+                        prob[iclass].open('w', 'FCELL', overwrite=True)
+
+                for iclass, label in enumerate(index):
+                    result_proba_class = result_proba[:, label]
+                    result_proba_class = result_proba_class.reshape((rowincr, current.cols))
+                    result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
+
+                    for row in range(rowincr):
+                        newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+                        newrow[:] = result_proba_class[row, :]
+                        prob[iclass].put_row(newrow)
+
+        # close maps
+        for i in range(n_features):
+            rasstack[i].close()
+
+    else:
+
+        # ---------------------------------------------------------------------
+        # parallel prediction
+        # ---------------------------------------------------------------------
+
+        # create lists of row increments
+        row_mins, row_maxs = [], []
+        for row in range(0, current.rows, rowincr):
+            if row+rowincr > current.rows:
+                rowincr = current.rows - row
+            row_mins.append(row)
+            row_maxs.append(row+rowincr)
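+
+        # e.g. with current.rows=100 and rowincr=25 (hypothetical values) this
+        # yields row_mins=[0, 25, 50, 75] and row_maxs=[25, 50, 75, 100]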
+
+        # perform predictions on lists of row increments in parallel
+        predictions = Parallel(n_jobs=n_jobs, max_nbytes=None)(
+            delayed(__predict_parallel)
+            (estimator, predictors, predict_type, current, row_min, row_max)
+            for row_min, row_max in zip(row_mins, row_maxs))
+
+        # unpack the results
+        results = []
+        ftypes = []
+        for block in predictions:
+            results.append(block[0])
+            ftypes.append(block[1])
+
+        # -------------------------------------------------------------------------
+        #  writing of predicted results for classification
+        # -------------------------------------------------------------------------
+        if predict_type == 'raw':
+            classification = RasterRow(output)
+            classification.open('w', ftypes[0], overwrite=True)
+
+            # write the classification result
+            for result_block in results:
+                for row in range(result_block.shape[0]):
+                    newrow = Buffer((result_block.shape[1],), mtype=ftypes[0])
+                    newrow[:] = result_block[row, :]
+                    classification.put_row(newrow)
+
+        # -------------------------------------------------------------------------
+        # writing of predicted results for probabilities
+        # -------------------------------------------------------------------------
+        if predict_type == 'prob':
+            # determine number of classes
+            if index is None:
+                index = range(results[0].shape[2])
+                n_classes = len(index)
+            else:
+                n_classes = len(np.unique(index))
+
+            # create and open RasterRow objects for probabilities
+            prob_out_raster = [0] * n_classes
+            prob = [0] * n_classes
+            for iclass, label in enumerate(index):
+                prob_out_raster[iclass] = output + '_classPr' + str(label)
+                prob[iclass] = RasterRow(prob_out_raster[iclass])
+                prob[iclass].open('w', 'FCELL', overwrite=True)
+
+            # write the class probability results
+            for results_proba_block in results:
+                for iclass, label in enumerate(index):
+                    result_proba_class = results_proba_block[:, :, label]
+
+                    for row in range(result_proba_class.shape[0]):
+                        newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+                        newrow[:] = result_proba_class[row, :]
+                        prob[iclass].put_row(newrow)
+
+    # -------------------------------------------------------------------------
+    # close all maps
+    # -------------------------------------------------------------------------
+    if predict_type == 'raw':
+        classification.close()
+    if predict_type == 'prob':
+        try:
+            for iclass in range(n_classes):
+                prob[iclass].close()
+        except:
+            pass
+
+
+def __predict_parallel(estimator, predictors, predict_type, current, row_min, row_max):
+    """
+    Performs prediction on a range of rows of GRASS rasters
+
+    Args
+    ----
+    estimator: scikit-learn estimator object
+    predictors: list of GRASS rasters
+    predict_type: character, 'raw' for classification/regression;
+                  'prob' for class probabilities
+    current: current region settings
+    row_min, row_max: Range of raster rows to perform prediction on
+
+    Returns
+    -------
+    result: 2D (classification) or 3D numpy array (class probabilities) of predictions
+    ftype: GRASS data storage type ('CELL' or 'FCELL'; None for probabilities)
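+
+    Example
+    -------
+    Called internally by predict(); a sketch for the first 25 rows with
+    hypothetical raster names and an already-fitted estimator clf:
+
+    >>> block, ftype = __predict_parallel(clf, ['dem', 'slope'], 'raw',
+    ...                                   Region(), 0, 25)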
+    """
+
+    # initialize output
+    result, ftype, mask = None, None, None
+
+    # open grass rasters
+    n_features = len(predictors)
+    rasstack = [0] * n_features
+
+    for i in range(n_features):
+        rasstack[i] = RasterRow(predictors[i])
+        if rasstack[i].exist() is True:
+            rasstack[i].open('r')
+        else:
+            gscript.fatal("GRASS raster " + predictors[i] +
+                          " does not exist.... exiting")
+
+    # loop through each row, and each band and add to 2D img_np_row
+    img_np_row = np.zeros((row_max-row_min, current.cols, n_features))
+    for row in range(row_min, row_max):
+        for band in range(n_features):
+            img_np_row[row-row_min, :, band] = np.array(rasstack[band][row])
+
+    # build a NaN mask flagging cells with nodata in any band
+    img_np_row[img_np_row == -2147483648] = np.nan
+    mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+    for feature in range(n_features):
+        invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+        mask[invalid_indexes] = np.nan
+
+    # reshape each row-band matrix into a n*m array
+    nsamples = (row_max-row_min) * current.cols
+    flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+    # remove NaNs prior to passing to scikit-learn predict
+    flat_pixels = np.nan_to_num(flat_pixels)
+
+    # perform prediction for classification/regression
+    if predict_type == 'raw':
+        result = estimator.predict(flat_pixels)
+        result = result.reshape((row_max-row_min, current.cols))
+
+        # determine nodata value and grass raster type
+        if result.dtype == 'float':
+            nodata = np.nan
+            ftype = 'FCELL'
+        else:
+            nodata = -2147483648
+            ftype = 'CELL'
+
+        # mask cells that contained nodata in any predictor so that
+        # the prediction does not extend beyond the valid data area
+        result[np.nonzero(np.isnan(mask))] = nodata
+
+    # perform prediction for class probabilities
+    if predict_type == 'prob':
+        result = estimator.predict_proba(flat_pixels)
+        result = result.reshape((row_max-row_min, current.cols, result.shape[1]))
+        result[np.nonzero(np.isnan(mask))] = np.nan
+
+    # close maps
+    for i in range(n_features):
+        rasstack[i].close()
+
+    return (result, ftype)
+

Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py	2017-06-01 10:01:54 UTC (rev 71156)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_utils.py	2017-06-01 22:03:12 UTC (rev 71157)
@@ -1,282 +1,282 @@
-#!/usr/bin/env python
-# -- coding: utf-8 --
-
-"""
-The module rlearn_utils contains functinons to assist
-with passing pre-defined scikit learn classifiers
-and other utilities for loading/saving training data.
-
-"""
-
-from __future__ import absolute_import
-from subprocess import PIPE
-
-import numpy as np
-import os
-from copy import deepcopy
-
-import grass.script as gscript
-from grass.pygrass.modules.shortcuts import imagery as im
-
-def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
-    """
-    Provides the classifiers and parameters using by the module
-
-    Args
-    ----
-    estimator: Name of estimator
-    random_state: Seed to use in randomized components
-    n_jobs: Integer, number of processing cores to use
-    p: Dict, containing classifier setttings
-    weights: None, or 'balanced' to add class_weights
-
-    Returns
-    -------
-    clf: Scikit-learn classifier object
-    mode: Flag to indicate whether classifier performs classification or
-          regression
-    """
-
-    from sklearn.linear_model import LogisticRegression
-    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
-    from sklearn.naive_bayes import GaussianNB
-    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-    from sklearn.ensemble import (
-        RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
-        ExtraTreesRegressor)
-    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
-    from sklearn.svm import SVC
-    from sklearn.neighbors import KNeighborsClassifier
-
-    # convert balanced boolean to scikit learn method
-    if weights is True:
-        weights = 'balanced'
-    else: weights = None
-
-    # optional packages that add additional classifiers here
-    if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
-        try:
-            from sklearn.pipeline import Pipeline
-            from pyearth import Earth
-
-            earth_classifier = Pipeline([('Earth',
-                                          Earth(max_degree=p['max_degree'])),
-                                         ('Logistic', LogisticRegression(n_jobs=n_jobs))])
-
-            classifiers = {'EarthClassifier': earth_classifier,
-                           'EarthRegressor': Earth(max_degree=p['max_degree'])}
-        except:
-            gscript.fatal('Py-earth package not installed')
-
-    elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
-        try:
-            from xgboost import XGBClassifier, XGBRegressor
-
-            if p['max_depth'] is None:
-                p['max_depth'] = int(3)
-
-            classifiers = {
-                'XGBClassifier':
-                    XGBClassifier(learning_rate=p['learning_rate'],
-                                  n_estimators=p['n_estimators'],
-                                  max_depth=p['max_depth'],
-                                  subsample=p['subsample'],
-                                  nthread=n_jobs),
-                'XGBRegressor':
-                    XGBRegressor(learning_rate=p['learning_rate'],
-                                 n_estimators=p['n_estimators'],
-                                 max_depth=p['max_depth'],
-                                 subsample=p['subsample'],
-                                 nthread=n_jobs)}
-        except:
-            gscript.fatal('XGBoost package not installed')
-    else:
-        # core sklearn classifiers go here
-        classifiers = {
-            'SVC': SVC(C=p['C'],
-                       class_weight=weights,
-                       probability=True,
-                       random_state=random_state),
-            'LogisticRegression':
-                LogisticRegression(C=p['C'],
-                                   class_weight=weights,
-                                   random_state=random_state,
-                                   n_jobs=n_jobs,
-                                   fit_intercept=True),
-            'DecisionTreeClassifier':
-                DecisionTreeClassifier(max_depth=p['max_depth'],
-                                       max_features=p['max_features'],
-                                       min_samples_split=p['min_samples_split'],
-                                       min_samples_leaf=p['min_samples_leaf'],
-                                       class_weight=weights,
-                                       random_state=random_state),
-            'DecisionTreeRegressor':
-                DecisionTreeRegressor(max_features=p['max_features'],
-                                      min_samples_split=p['min_samples_split'],
-                                      min_samples_leaf=p['min_samples_leaf'],
-                                      random_state=random_state),
-            'RandomForestClassifier':
-                RandomForestClassifier(n_estimators=p['n_estimators'],
-                                       max_features=p['max_features'],
-                                       min_samples_split=p['min_samples_split'],
-                                       min_samples_leaf=p['min_samples_leaf'],
-                                       class_weight=weights,
-                                       random_state=random_state,
-                                       n_jobs=n_jobs,
-                                       oob_score=False),
-            'RandomForestRegressor':
-                RandomForestRegressor(n_estimators=p['n_estimators'],
-                                      max_features=p['max_features'],
-                                      min_samples_split=p['min_samples_split'],
-                                      min_samples_leaf=p['min_samples_leaf'],
-                                      random_state=random_state,
-                                      n_jobs=n_jobs,
-                                      oob_score=False),
-            'ExtraTreesClassifier':
-                ExtraTreesClassifier(n_estimators=p['n_estimators'],
-                                     max_features=p['max_features'],
-                                     min_samples_split=p['min_samples_split'],
-                                     min_samples_leaf=p['min_samples_leaf'],
-                                     class_weight=weights,
-                                     random_state=random_state,
-                                     n_jobs=n_jobs,
-                                     oob_score=False),
-            'ExtraTreesRegressor':
-                ExtraTreesRegressor(n_estimators=p['n_estimators'],
-                                    max_features=p['max_features'],
-                                    min_samples_split=p['min_samples_split'],
-                                    min_samples_leaf=p['min_samples_leaf'],
-                                    random_state=random_state,
-                                    n_jobs=n_jobs,
-                                    oob_score=False),
-            'GradientBoostingClassifier':
-                GradientBoostingClassifier(learning_rate=p['learning_rate'],
-                                           n_estimators=p['n_estimators'],
-                                           max_depth=p['max_depth'],
-                                           min_samples_split=p['min_samples_split'],
-                                           min_samples_leaf=p['min_samples_leaf'],
-                                           subsample=p['subsample'],
-                                           max_features=p['max_features'],
-                                           random_state=random_state),
-            'GradientBoostingRegressor':
-                GradientBoostingRegressor(learning_rate=p['learning_rate'],
-                                          n_estimators=p['n_estimators'],
-                                          max_depth=p['max_depth'],
-                                          min_samples_split=p['min_samples_split'],
-                                          min_samples_leaf=p['min_samples_leaf'],
-                                          subsample=p['subsample'],
-                                          max_features=p['max_features'],
-                                          random_state=random_state),
-            'GaussianNB': GaussianNB(),
-            'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
-            'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
-            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=p['n_neighbors'],
-                                                         weights=p['weights'],
-                                                         n_jobs=n_jobs)
-        }
-
-    # define classifier
-    clf = classifiers[estimator]
-
-    # classification or regression
-    if estimator == 'LogisticRegression' \
-        or estimator == 'DecisionTreeClassifier' \
-        or estimator == 'RandomForestClassifier' \
-        or estimator == 'ExtraTreesClassifier' \
-        or estimator == 'GradientBoostingClassifier' \
-        or estimator == 'GaussianNB' \
-        or estimator == 'LinearDiscriminantAnalysis' \
-        or estimator == 'QuadraticDiscriminantAnalysis' \
-        or estimator == 'EarthClassifier' \
-        or estimator == 'XGBClassifier' \
-        or estimator == 'SVC' \
-        or estimator == 'KNeighborsClassifier':
-        mode = 'classification'
-    else:
-        mode = 'regression'
-
-    return (clf, mode)
-
-
-def save_training_data(X, y, groups, coords, file):
-    """
-    Saves any extracted training data to a csv file
-
-    Args
-    ----
-    X: Numpy array containing predictor values
-    y: Numpy array containing labels
-    groups: Numpy array of group labels
-    coords: Numpy array containing xy coordinates of samples
-    file: Path to a csv file to save data to
-    """
-
-    # if there are no group labels, create a nan filled array
-    if groups is None:
-        groups = np.empty((y.shape[0]))
-        groups[:] = np.nan
-
-    training_data = np.column_stack([coords, X, y, groups])
-    np.savetxt(file, training_data, delimiter=',')
-
-
-def load_training_data(file):
-    """
-    Loads training data and labels from a csv file
-
-    Args
-    ----
-    file: Path to a csv file to save data to
-
-    Returns
-    -------
-    X: Numpy array containing predictor values
-    y: Numpy array containing labels
-    groups: Numpy array of group labels, or None
-    coords: Numpy array containing x,y coordinates of samples
-    """
-
-    training_data = np.loadtxt(file, delimiter=',')
-    n_cols = training_data.shape[1]
-    last_Xcol = n_cols-2
-
-    # check to see if last column contains group labels or nans
-    groups = training_data[:, -1]
-
-    # if all nans then set groups to None
-    if bool(np.isnan(groups).all()) is True:
-        groups = None
-
-    # fetch X and y
-    coords = training_data[:, 0:2]
-    X = training_data[:, 2:last_Xcol]
-    y = training_data[:, -2]
-
-    return(X, y, groups, coords)
-
-
-def maps_from_group(group):
-    """
-    Parse individual rasters into a list from an imagery group
-
-    Args
-    ----
-    group: String; GRASS imagery group
-
-    Returns
-    -------
-    maplist: List containing individual GRASS raster maps
-    map_names: List with print friendly map names
-    """
-    groupmaps = im.group(group=group, flags="g",
-                         quiet=True, stdout_=PIPE).outputs.stdout
-
-    maplist = groupmaps.split(os.linesep)
-    maplist = maplist[0:len(maplist)-1]
-    map_names = []
-
-    for rastername in maplist:
-        map_names.append(rastername.split('@')[0])
-
-    return(maplist, map_names)
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+The module rlearn_utils contains functinons to assist
+with passing pre-defined scikit learn classifiers
+and other utilities for loading/saving training data.
+
+"""
+
+from __future__ import absolute_import
+from subprocess import PIPE
+
+import numpy as np
+import os
+from copy import deepcopy
+
+import grass.script as gscript
+from grass.pygrass.modules.shortcuts import imagery as im
+
+def model_classifiers(estimator, random_state, n_jobs, p, weights=None):
+    """
+    Provides the classifiers and parameters used by the module
+
+    Args
+    ----
+    estimator: Name of estimator
+    random_state: Seed to use in randomized components
+    n_jobs: Integer, number of processing cores to use
+    p: Dict, containing classifier settings
+    weights: Boolean, True to use 'balanced' class weights
+
+    Returns
+    -------
+    clf: Scikit-learn classifier object
+    mode: Flag to indicate whether classifier performs classification or
+          regression
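+
+    Example
+    -------
+    A sketch with hypothetical settings; note that p must hold the keys
+    used by all of the core classifiers:
+
+    >>> p = {'C': 1.0, 'max_depth': None, 'max_features': 'sqrt',
+    ...      'min_samples_split': 2, 'min_samples_leaf': 1,
+    ...      'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 1.0,
+    ...      'n_neighbors': 5, 'weights': 'uniform'}
+    >>> clf, mode = model_classifiers('RandomForestClassifier', 1, 2, p)
+    >>> mode
+    'classification'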
+    """
+
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+    from sklearn.naive_bayes import GaussianNB
+    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+    from sklearn.ensemble import (
+        RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier,
+        ExtraTreesRegressor)
+    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+    from sklearn.svm import SVC
+    from sklearn.neighbors import KNeighborsClassifier
+
+    # convert the balance boolean to the scikit-learn class_weight keyword
+    if weights is True:
+        weights = 'balanced'
+    else:
+        weights = None
+
+    # optional packages that add additional classifiers here
+    if estimator == 'EarthClassifier' or estimator == 'EarthRegressor':
+        try:
+            from sklearn.pipeline import Pipeline
+            from pyearth import Earth
+
+            earth_classifier = Pipeline([('Earth',
+                                          Earth(max_degree=p['max_degree'])),
+                                         ('Logistic', LogisticRegression(n_jobs=n_jobs))])
+
+            classifiers = {'EarthClassifier': earth_classifier,
+                           'EarthRegressor': Earth(max_degree=p['max_degree'])}
+        except ImportError:
+            gscript.fatal('Py-earth package not installed')
+
+    elif estimator == 'XGBClassifier' or estimator == 'XGBRegressor':
+        try:
+            from xgboost import XGBClassifier, XGBRegressor
+
+            if p['max_depth'] is None:
+                p['max_depth'] = 3
+
+            classifiers = {
+                'XGBClassifier':
+                    XGBClassifier(learning_rate=p['learning_rate'],
+                                  n_estimators=p['n_estimators'],
+                                  max_depth=p['max_depth'],
+                                  subsample=p['subsample'],
+                                  nthread=n_jobs),
+                'XGBRegressor':
+                    XGBRegressor(learning_rate=p['learning_rate'],
+                                 n_estimators=p['n_estimators'],
+                                 max_depth=p['max_depth'],
+                                 subsample=p['subsample'],
+                                 nthread=n_jobs)}
+        except ImportError:
+            gscript.fatal('XGBoost package not installed')
+    else:
+        # core sklearn classifiers go here
+        classifiers = {
+            'SVC': SVC(C=p['C'],
+                       class_weight=weights,
+                       probability=True,
+                       random_state=random_state),
+            'LogisticRegression':
+                LogisticRegression(C=p['C'],
+                                   class_weight=weights,
+                                   random_state=random_state,
+                                   n_jobs=n_jobs,
+                                   fit_intercept=True),
+            'DecisionTreeClassifier':
+                DecisionTreeClassifier(max_depth=p['max_depth'],
+                                       max_features=p['max_features'],
+                                       min_samples_split=p['min_samples_split'],
+                                       min_samples_leaf=p['min_samples_leaf'],
+                                       class_weight=weights,
+                                       random_state=random_state),
+            'DecisionTreeRegressor':
+                DecisionTreeRegressor(max_features=p['max_features'],
+                                      min_samples_split=p['min_samples_split'],
+                                      min_samples_leaf=p['min_samples_leaf'],
+                                      random_state=random_state),
+            'RandomForestClassifier':
+                RandomForestClassifier(n_estimators=p['n_estimators'],
+                                       max_features=p['max_features'],
+                                       min_samples_split=p['min_samples_split'],
+                                       min_samples_leaf=p['min_samples_leaf'],
+                                       class_weight=weights,
+                                       random_state=random_state,
+                                       n_jobs=n_jobs,
+                                       oob_score=False),
+            'RandomForestRegressor':
+                RandomForestRegressor(n_estimators=p['n_estimators'],
+                                      max_features=p['max_features'],
+                                      min_samples_split=p['min_samples_split'],
+                                      min_samples_leaf=p['min_samples_leaf'],
+                                      random_state=random_state,
+                                      n_jobs=n_jobs,
+                                      oob_score=False),
+            'ExtraTreesClassifier':
+                ExtraTreesClassifier(n_estimators=p['n_estimators'],
+                                     max_features=p['max_features'],
+                                     min_samples_split=p['min_samples_split'],
+                                     min_samples_leaf=p['min_samples_leaf'],
+                                     class_weight=weights,
+                                     random_state=random_state,
+                                     n_jobs=n_jobs,
+                                     oob_score=False),
+            'ExtraTreesRegressor':
+                ExtraTreesRegressor(n_estimators=p['n_estimators'],
+                                    max_features=p['max_features'],
+                                    min_samples_split=p['min_samples_split'],
+                                    min_samples_leaf=p['min_samples_leaf'],
+                                    random_state=random_state,
+                                    n_jobs=n_jobs,
+                                    oob_score=False),
+            'GradientBoostingClassifier':
+                GradientBoostingClassifier(learning_rate=p['learning_rate'],
+                                           n_estimators=p['n_estimators'],
+                                           max_depth=p['max_depth'],
+                                           min_samples_split=p['min_samples_split'],
+                                           min_samples_leaf=p['min_samples_leaf'],
+                                           subsample=p['subsample'],
+                                           max_features=p['max_features'],
+                                           random_state=random_state),
+            'GradientBoostingRegressor':
+                GradientBoostingRegressor(learning_rate=p['learning_rate'],
+                                          n_estimators=p['n_estimators'],
+                                          max_depth=p['max_depth'],
+                                          min_samples_split=p['min_samples_split'],
+                                          min_samples_leaf=p['min_samples_leaf'],
+                                          subsample=p['subsample'],
+                                          max_features=p['max_features'],
+                                          random_state=random_state),
+            'GaussianNB': GaussianNB(),
+            'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+            'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=p['n_neighbors'],
+                                                         weights=p['weights'],
+                                                         n_jobs=n_jobs)
+        }
+
+    # define classifier
+    clf = classifiers[estimator]
+
+    # classification or regression
+    if estimator in (
+            'LogisticRegression', 'DecisionTreeClassifier',
+            'RandomForestClassifier', 'ExtraTreesClassifier',
+            'GradientBoostingClassifier', 'GaussianNB',
+            'LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis',
+            'EarthClassifier', 'XGBClassifier', 'SVC',
+            'KNeighborsClassifier'):
+        mode = 'classification'
+    else:
+        mode = 'regression'
+
+    return (clf, mode)
+
+
+def save_training_data(X, y, groups, coords, file):
+    """
+    Saves any extracted training data to a csv file
+
+    Args
+    ----
+    X: Numpy array containing predictor values
+    y: Numpy array containing labels
+    groups: Numpy array of group labels
+    coords: Numpy array containing xy coordinates of samples
+    file: Path to a csv file to save data to
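+
+    Example
+    -------
+    A sketch with random data and a hypothetical output path:
+
+    >>> X = np.random.rand(10, 3)
+    >>> y = np.random.randint(0, 2, 10)
+    >>> coords = np.random.rand(10, 2)
+    >>> save_training_data(X, y, None, coords, '/tmp/training.csv')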
+    """
+
+    # if there are no group labels, create a nan filled array
+    if groups is None:
+        groups = np.empty((y.shape[0]))
+        groups[:] = np.nan
+
+    training_data = np.column_stack([coords, X, y, groups])
+    np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+    """
+    Loads training data and labels from a csv file
+
+    Args
+    ----
+    file: Path to a csv file to load data from
+
+    Returns
+    -------
+    X: Numpy array containing predictor values
+    y: Numpy array containing labels
+    groups: Numpy array of group labels, or None
+    coords: Numpy array containing x,y coordinates of samples
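+
+    Example
+    -------
+    Round-trips a file written by save_training_data (hypothetical path):
+
+    >>> X, y, groups, coords = load_training_data('/tmp/training.csv')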
+    """
+
+    training_data = np.loadtxt(file, delimiter=',')
+    n_cols = training_data.shape[1]
+    last_Xcol = n_cols-2
+
+    # check to see if last column contains group labels or nans
+    groups = training_data[:, -1]
+
+    # if all nans then set groups to None
+    if np.isnan(groups).all():
+        groups = None
+
+    # fetch X and y
+    coords = training_data[:, 0:2]
+    X = training_data[:, 2:last_Xcol]
+    y = training_data[:, -2]
+
+    return (X, y, groups, coords)
+
+
+def maps_from_group(group):
+    """
+    Parse individual rasters into a list from an imagery group
+
+    Args
+    ----
+    group: String; GRASS imagery group
+
+    Returns
+    -------
+    maplist: List containing individual GRASS raster maps
+    map_names: List with print-friendly map names
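+
+    Example
+    -------
+    A sketch assuming an imagery group named 'landsat' exists in the
+    current mapset:
+
+    >>> maplist, map_names = maps_from_group('landsat')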
+    """
+    groupmaps = im.group(group=group, flags="g",
+                         quiet=True, stdout_=PIPE).outputs.stdout
+
+    maplist = groupmaps.split(os.linesep)
+    maplist = maplist[:-1]  # drop the trailing empty string
+    map_names = []
+
+    for rastername in maplist:
+        map_names.append(rastername.split('@')[0])
+
+    return (maplist, map_names)


