[GRASS-SVN] r71986 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
Wed Dec 27 22:25:30 PST 2017
Author: spawley
Date: 2017-12-27 22:25:30 -0800 (Wed, 27 Dec 2017)
New Revision: 71986
Removed:
grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
Modified:
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
Log:
r.learn.ml: deleted rlearn_crossval.py and moved its cross-validation functions into r.learn.ml.py
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-12-28 06:19:41 UTC (rev 71985)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-12-28 06:25:30 UTC (rev 71986)
@@ -417,12 +417,387 @@
from grass.pygrass.modules.shortcuts import raster as r
gs.utils.set_path(modulename='r.learn.ml')
-from rlearn_crossval import cross_val_scores
from rlearn_sampling import extract_pixels, extract_points
from rlearn_prediction import predict
from rlearn_utils import (
model_classifiers, save_training_data, load_training_data, maps_from_group)
+
+def specificity_score(y_true, y_pred):
+ """
+ Calculate specificity score (for binary classification)
+
+ Args
+ ----
+ y_true (1d numpy array): true values of class labels
+ y_pred (1d numpy array): predicted class labels
+
+ Returns
+ -------
+ specificity (float): specificity score
+ """
+
+ from sklearn.metrics import confusion_matrix
+
+ cm = confusion_matrix(y_true, y_pred)
+ tn = float(cm[0][0])
+ fp = float(cm[0][1])
+
+ return tn/(tn+fp)
+
+
+def varimp_permutation(estimator, X, y, n_permutations, scorer,
+ n_jobs, random_state):
+ """
+ Perform permutation-based feature importance during
+ cross-validation (cross-validation is applied externally to this
+ method)
+
+ Procedure is:
+ 1. Pass a fitted estimator and a test partition X, y
+ 2. Assess the score on the unperturbed test partition (best_score)
+ 3. Permute each predictor and assess the difference between best_score
+ and the score obtained on the permuted data
+ 4. Repeat (3) for many random permutations
+ 5. Average the repeats
+
+ Args
+ ----
+ estimator (object): estimator that has been fitted to a training partition
+ X, y: 2d and 1d numpy arrays of data and labels from a test partition
+ n_permutations (integer): number of random permutations to apply
+ scorer (object): scikit-learn metric function to use
+ n_jobs (integer): number of processing cores
+ random_state (integer): seed to pass to numpy's RandomState
+
+ Returns
+ -------
+ scores (1d numpy array): mean score decrease per predictor across permutations
+ """
+
+ from sklearn.externals.joblib import Parallel, delayed
+
+ # calculate baseline score on the original, unpermuted variables
+ y_pred = estimator.predict(X)
+ best_score = scorer(y, y_pred)
+
+ # repeated permutations and return difference from best score per predictor
+ scores = Parallel(n_jobs=n_jobs)(
+ delayed(__permute)(
+ estimator, X, y, best_score, scorer, random_state)
+ for n in range(n_permutations))
+
+ # average the repetitions
+ scores = np.asarray(scores)
+ scores = scores.mean(axis=0)
+
+ return scores
+
+
+def __permute(estimator, X, y, best_score, scorer, random_state):
+ """
+ Permute each predictor and measure difference from best score
+
+ Args
+ ----
+ estimator (object): scikit-learn estimator
+ X, y: 2d and 1d numpy arrays of data and labels from a test partition
+ best_score (float): best score obtained on unperturbed data
+ scorer (object): scoring method used to measure importances
+ random_state (integer): random seed
+
+ Returns
+ -------
+ scores (1D numpy array): score decrease for each predictor following permutation
+ """
+
+ from numpy.random import RandomState
+ rstate = RandomState(random_state)
+
+ # permute each predictor variable and assess difference in score
+ scores = np.zeros(X.shape[1])
+
+ for i in range(X.shape[1]):
+ Xscram = np.copy(X)
+ Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])
+
+ # predict the test data using the permuted predictor
+ y_pred = estimator.predict(Xscram)
+ scores[i] = best_score-scorer(y, y_pred)
+ if scores[i] < 0:
+ scores[i] = 0
+
+ return scores
+
+
+def __parallel_fit(estimator, X, y, groups, train_indices, sample_weight):
+ """
+ Fit classifiers/regressors in parallel
+
+ Args
+ ----
+ estimator (object): scikit-learn estimator
+ X, y: 2D and 1D numpy arrays of training data and labels
+ groups (1D numpy array): of len(y) containing group labels
+ train_indices (1D numpy array): indices to use for training
+ sample_weight (1D numpy array): of len(y) containing weights to use during
+ fitting
+ """
+ from sklearn.pipeline import Pipeline
+
+ rs_estimator = deepcopy(estimator)
+
+ # create training fold
+ X_train, y_train = X[train_indices], y[train_indices]
+
+ if groups is not None:
+ groups_train = groups[train_indices]
+ else:
+ groups_train = None
+
+ # subset sample_weight for the training fold
+ if sample_weight is not None:
+ weights = sample_weight[train_indices]
+
+ # specify fit_params for sample_weights if required
+ if isinstance(estimator, Pipeline) and sample_weight is not None:
+ fit_params = {'classifier__sample_weight': weights}
+ elif not isinstance(estimator, Pipeline) and sample_weight is not None:
+ fit_params = {'sample_weight': weights}
+ else:
+ fit_params = {}
+
+ # fit estimator with/without groups
+ if groups is not None and type(estimator).__name__ in ['RandomizedSearchCV', 'GridSearchCV']:
+ rs_estimator.fit(X_train, y_train, groups=groups_train, **fit_params)
+ else:
+ rs_estimator.fit(X_train, y_train, **fit_params)
+
+ return rs_estimator
+
+
+def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
+ scoring='accuracy', feature_importances=False,
+ n_permutations=25, random_state=None, n_jobs=-1):
+ """
+ Stratified k-fold and group k-fold cross-validation using multiple
+ scoring metrics and permutation feature importances
+
+ Args
+ ----
+ estimator (object): scikit-learn estimator
+ X, y: 2D and 1D numpy array of training data and labels
+ groups (1D numpy array): group labels
+ sample_weight (1D numpy array of shape [n_samples]): per-sample weights
+ cv (integer or object): Number of cross-validation folds or
+ sklearn.model_selection object
+ scoring (list): List of performance metrics to use
+ feature_importances (boolean): option to perform permutation-based importances
+ n_permutations (integer): Number of permutations during feature importance
+ random_state (integer): Seed to pass to the random number generator
+
+ Returns
+ -------
+ scores (dict): Containing lists of scores per cross-validation fold
+ byclass_scores (dict): Containing scores per class
+ fimp (2D numpy array): permutation feature importances per feature
+ clf_resamples (list): List of fitted estimators
+ predictions (2d numpy array): columns contain y_true, y_pred, fold
+ """
+
+ from sklearn import metrics
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.externals.joblib import Parallel, delayed
+
+ # first unwrap the estimator from any potential pipelines or gridsearchCV
+ if type(estimator).__name__ == 'Pipeline':
+ clf_type = estimator.named_steps['classifier']
+ else:
+ clf_type = estimator
+
+ if type(clf_type).__name__ == 'GridSearchCV' or \
+ type(clf_type).__name__ == 'RandomizedSearchCV':
+ clf_type = clf_type.best_estimator_
+
+ # check name against already multithreaded classifiers
+ if type(clf_type).__name__ in [
+ 'RandomForestClassifier',
+ 'RandomForestRegressor',
+ 'ExtraTreesClassifier',
+ 'ExtraTreesRegressor',
+ 'KNeighborsClassifier']:
+ n_jobs = 1
+
+ # -------------------------------------------------------------------------
+ # create copies of estimator and create cross-validation iterator
+ # -------------------------------------------------------------------------
+
+ # deepcopy estimator
+ clf = deepcopy(estimator)
+
+ # create model_selection method
+ if isinstance(cv, int):
+ cv = StratifiedKFold(n_splits=cv)
+
+ # -------------------------------------------------------------------------
+ # create dictionary of lists to store metrics
+ # -------------------------------------------------------------------------
+
+ if isinstance(scoring, basestring):
+ scoring = [scoring]
+ scores = dict.fromkeys(scoring)
+ scores = {key: [] for key, value in scores.iteritems()}
+ scoring_methods = {'accuracy': metrics.accuracy_score,
+ 'balanced_accuracy': metrics.recall_score,
+ 'average_precision': metrics.average_precision_score,
+ 'brier_loss': metrics.brier_score_loss,
+ 'kappa': metrics.cohen_kappa_score,
+ 'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'hamming_loss': metrics.hamming_loss,
+ 'jaccard_similarity': metrics.jaccard_similarity_score,
+ 'log_loss': metrics.log_loss,
+ 'matthews_corrcoef': metrics.matthews_corrcoef,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score,
+ 'specificity': specificity_score,
+ 'roc_auc': metrics.roc_auc_score,
+ 'zero_one_loss': metrics.zero_one_loss,
+ 'r2': metrics.r2_score,
+ 'explained_variance': metrics.explained_variance_score,
+ 'neg_mean_absolute_error': metrics.mean_absolute_error,
+ 'neg_mean_squared_error': metrics.mean_squared_error,
+ 'neg_mean_squared_log_error': metrics.mean_squared_log_error,
+ 'neg_median_absolute_error': metrics.median_absolute_error}
+
+ byclass_methods = {'f1': metrics.f1_score,
+ 'fbeta': metrics.fbeta_score,
+ 'precision': metrics.precision_score,
+ 'recall': metrics.recall_score}
+
+ # create dict to store byclass metrics results
+ n_classes = len(np.unique(y))
+ labels = np.unique(y)
+ byclass_scores = dict.fromkeys(byclass_methods)
+ byclass_scores = {key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
+
+ # remove any byclass_scorers that are not in the scoring list
+ byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
+
+ # check if remaining scorers are valid sklearn metrics
+ for i in scores.keys():
+ if i not in scoring_methods:
+ gs.fatal("Scoring method '%s' is not a valid scoring method. "
+ "Valid methods are: %s"
+ % (i, ', '.join(scoring_methods.keys())))
+
+ # set averaging type for global binary or multiclass scores
+ if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
+ average = 'binary'
+ else:
+ average = 'macro'
+
+ # create np array to store feature importance scores
+ if feature_importances is True:
+ fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
+ fimp[:] = np.nan
+ else:
+ fimp = None
+
+ # -------------------------------------------------------------------------
+ # extract cross-validation indices
+ # -------------------------------------------------------------------------
+
+ if groups is None:
+ k_fold = cv.split(X, y)
+ else:
+ k_fold = cv.split(X, y, groups=groups)
+
+ trains, tests = [], []
+ for train_indices, test_indices in k_fold:
+ trains.append(train_indices)
+ tests.append(test_indices)
+
+ # -------------------------------------------------------------------------
+ # Perform multiprocessing fitting of clf on each fold
+ # -------------------------------------------------------------------------
+ clf_resamples = Parallel(n_jobs=n_jobs)(
+ delayed(__parallel_fit)(clf, X, y, groups, train_indices, sample_weight)
+ for train_indices in trains)
+
+ # -------------------------------------------------------------------------
+ # loop through each fold and calculate performance metrics
+ # -------------------------------------------------------------------------
+
+ # store predictions and indices
+ predictions = np.zeros((len(y), 3)) # y_true, y_pred, fold
+
+ fold = 0
+ for train_indices, test_indices in zip(trains, tests):
+
+ # create test fold
+ X_test, y_test = X[test_indices], y[test_indices]
+
+ # prediction of test fold
+ y_pred = clf_resamples[fold].predict(X_test)
+ predictions[test_indices, 0] = y_test
+ predictions[test_indices, 1] = y_pred
+ predictions[test_indices, 2] = fold
+
+ # calculate global performance metrics
+ for m in scores.keys():
+ # metrics that require probabilities
+ if m == 'brier_loss' or m == 'roc_auc':
+ y_prob = clf_resamples[fold].predict_proba(X_test)[:, 1]
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_prob))
+
+ # metrics that have no averaging for multiclass
+ elif m in ('kappa', 'specificity', 'accuracy', 'hamming_loss',
+ 'jaccard_similarity', 'log_loss', 'zero_one_loss',
+ 'matthews_corrcoef', 'r2', 'explained_variance',
+ 'neg_mean_absolute_error', 'neg_mean_squared_error',
+ 'neg_mean_squared_log_error', 'neg_median_absolute_error'):
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](y_test, y_pred))
+
+ # balanced accuracy
+ elif m == 'balanced_accuracy':
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average='macro'))
+
+ # metrics that have averaging for multiclass
+ else:
+ scores[m] = np.append(
+ scores[m], scoring_methods[m](
+ y_test, y_pred, average=average))
+
+ # calculate per-class performance metrics
+ for key in byclass_scores.keys():
+ byclass_scores[key] = np.vstack((
+ byclass_scores[key], byclass_methods[key](
+ y_test, y_pred, labels=labels, average=None)))
+
+ # feature importances using permutation
+ if feature_importances is True:
+ fimp[fold, :] = varimp_permutation(
+ clf_resamples[fold], X_test, y_test, n_permutations,
+ scoring_methods[scoring[0]], n_jobs, random_state)
+ fold += 1
+
+ return (scores, byclass_scores, fimp, clf_resamples, predictions)
+
+
+
tmp_rast = []
def cleanup():
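For orientation, the following is a minimal sketch of how the relocated
cross_val_scores function (now defined inline in r.learn.ml.py, above) might
be called. The synthetic data, the RandomForestClassifier, and all parameter
values are illustrative assumptions, not part of this commit:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# illustrative inputs: X is a 2D predictor array, y a 1D label array
X, y = make_classification(n_samples=100, n_features=5, random_state=0)
clf = RandomForestClassifier(n_estimators=50, random_state=0)

scores, byclass, fimp, resamples, preds = cross_val_scores(
    clf, X, y, cv=3, scoring=['accuracy', 'kappa'],
    feature_importances=True, n_permutations=10, random_state=0)

print(np.mean(scores['accuracy']))  # mean accuracy across the 3 folds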
Deleted: grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py 2017-12-28 06:19:41 UTC (rev 71985)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py 2017-12-28 06:25:30 UTC (rev 71986)
@@ -1,397 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-The module rlearn_crossval contains functions to perform
-model validation and permutation feature importances.
-"""
-
-from __future__ import absolute_import
-from copy import deepcopy
-import numpy as np
-import os
-from numpy.random import RandomState
-import grass.script as gs
-
-
-def specificity_score(y_true, y_pred):
- """
-
- Calculate specificity score (for binary classification)
-
- Args
- ----
- y_true (1d numpy array): true values of class labels
- y_pred (1d numpy array): predicted class labels
-
- Returns
- -------
- specificity (float): specificity score
-
- """
-
- from sklearn.metrics import confusion_matrix
-
- cm = confusion_matrix(y_true, y_pred)
- tn = float(cm[0][0])
- fp = float(cm[0][1])
-
- return tn/(tn+fp)
-
-
-def varimp_permutation(estimator, X, y, n_permutations, scorer,
- n_jobs, random_state):
- """
-
- Perform permutation-based feature importance during
- cross-validation (cross-validation is applied externally to this
- method)
-
- Procedure is:
- 1. Pass a fitted estimator and a test partition X, y
- 2. Assess the score on the unperturbed test partition (best_score)
- 3. Permute each predictor and assess the difference between best_score
- and the score obtained on the permuted data
- 4. Repeat (3) for many random permutations
- 5. Average the repeats
-
- Args
- ----
- estimator (object): estimator that has been fitted to a training partition
- X, y: 2d and 1d numpy arrays of data and labels from a test partition
- n_permutations (integer): number of random permutations to apply
- scorer (object): scikit-learn metric function to use
- n_jobs (integer): number of processing cores
- random_state (integer): seed to pass to numpy's RandomState
-
- Returns
- -------
- scores (1d numpy array): mean score decrease per predictor across permutations
-
- """
-
- from sklearn.externals.joblib import Parallel, delayed
-
- # calculate baseline score on the original, unpermuted variables
- y_pred = estimator.predict(X)
- best_score = scorer(y, y_pred)
-
- # repeated permutations and return difference from best score per predictor
- scores = Parallel(n_jobs=n_jobs)(
- delayed(__permute)(
- estimator, X, y, best_score, scorer, random_state)
- for n in range(n_permutations))
-
- # average the repetitions
- scores = np.asarray(scores)
- scores = scores.mean(axis=0)
-
- return scores
-
-
-def __permute(estimator, X, y, best_score, scorer, random_state):
- """
-
- Permute each predictor and measure difference from best score
-
- Args
- ----
- estimator (object): scikit-learn estimator
- X, y: 2d and 1d numpy arrays of data and labels from a test partition
- best_score (float): best score obtained on unperturbed data
- scorer (object): scoring method used to measure importances
- random_state (integer): random seed
-
- Returns
- -------
- scores (1D numpy array): score decrease for each predictor following permutation
-
- """
-
- rstate = RandomState(random_state)
-
- # permute each predictor variable and assess difference in score
- scores = np.zeros(X.shape[1])
-
- for i in range(X.shape[1]):
- Xscram = np.copy(X)
- Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])
-
- # predict the test data using the permuted predictor
- y_pred = estimator.predict(Xscram)
- scores[i] = best_score-scorer(y, y_pred)
- if scores[i] < 0:
- scores[i] = 0
-
- return scores
-
-
-def __parallel_fit(estimator, X, y, groups, train_indices, sample_weight):
- """
-
- Fit classifiers/regressors in parallel
-
- Args
- ----
- estimator (object): scikit-learn estimator
- X, y: 2D and 1D numpy arrays of training data and labels
- groups (1D numpy array): of len(y) containing group labels
- train_indices (1D numpy array): indices to use for training
- sample_weight (1D numpy array): of len(y) containing weights to use during
- fitting
-
- """
- from sklearn.pipeline import Pipeline
-
- rs_estimator = deepcopy(estimator)
-
- # create training fold
- X_train, y_train = X[train_indices], y[train_indices]
-
- if groups is not None:
- groups_train = groups[train_indices]
- else:
- groups_train = None
-
- # subset sample_weight for the training fold
- if sample_weight is not None:
- weights = sample_weight[train_indices]
-
- # specify fit_params for sample_weights if required
- if isinstance(estimator, Pipeline) and sample_weight is not None:
- fit_params = {'classifier__sample_weight': weights}
- elif not isinstance(estimator, Pipeline) and sample_weight is not None:
- fit_params = {'sample_weight': weights}
- else:
- fit_params = {}
-
- # fit estimator with/without groups
- if groups is not None and type(estimator).__name__ in ['RandomizedSearchCV', 'GridSearchCV']:
- rs_estimator.fit(X_train, y_train, groups=groups_train, **fit_params)
- else:
- rs_estimator.fit(X_train, y_train, **fit_params)
-
- return rs_estimator
-
-
-def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
- scoring='accuracy', feature_importances=False,
- n_permutations=25, random_state=None, n_jobs=-1):
- """
-
- Stratified k-fold and group k-fold cross-validation using multiple
- scoring metrics and permutation feature importances
-
- Args
- ----
- estimator (object): scikit-learn estimator
- X, y: 2D and 1D numpy array of training data and labels
- groups (1D numpy array): group labels
- sample_weight (1D numpy array of shape [n_samples]): per-sample weights
- cv (integer or object): Number of cross-validation folds or
- sklearn.model_selection object
- scoring (list): List of performance metrics to use
- feature_importances (boolean): option to perform permutation-based importances
- n_permutations (integer): Number of permutations during feature importance
- random_state (integer): Seed to pass to the random number generator
-
- Returns
- -------
- scores (dict): Containing lists of scores per cross-validation fold
- byclass_scores (dict): Containing scores per class
- fimp (2D numpy array): permutation feature importances per feature
- clf_resamples (list): List of fitted estimators
- predictions (2d numpy array): columns contain y_true, y_pred, fold
-
- """
-
- from sklearn import metrics
- from sklearn.model_selection import StratifiedKFold
- from sklearn.externals.joblib import Parallel, delayed
-
- # first unwrap the estimator from any potential pipelines or gridsearchCV
- if type(estimator).__name__ == 'Pipeline':
- clf_type = estimator.named_steps['classifier']
- else:
- clf_type = estimator
-
- if type(clf_type).__name__ == 'GridSearchCV' or \
- type(clf_type).__name__ == 'RandomizedSearchCV':
- clf_type = clf_type.best_estimator_
-
- # check name against already multithreaded classifiers
- if type(clf_type).__name__ in [
- 'RandomForestClassifier',
- 'RandomForestRegressor',
- 'ExtraTreesClassifier',
- 'ExtraTreesRegressor',
- 'KNeighborsClassifier']:
- n_jobs = 1
-
- # -------------------------------------------------------------------------
- # create copies of estimator and create cross-validation iterator
- # -------------------------------------------------------------------------
-
- # deepcopy estimator
- clf = deepcopy(estimator)
-
- # create model_selection method
- if isinstance(cv, int):
- cv = StratifiedKFold(n_splits=cv)
-
- # -------------------------------------------------------------------------
- # create dictionary of lists to store metrics
- # -------------------------------------------------------------------------
-
- if isinstance(scoring, basestring):
- scoring = [scoring]
- scores = dict.fromkeys(scoring)
- scores = {key: [] for key, value in scores.iteritems()}
- scoring_methods = {'accuracy': metrics.accuracy_score,
- 'balanced_accuracy': metrics.recall_score,
- 'average_precision': metrics.average_precision_score,
- 'brier_loss': metrics.brier_score_loss,
- 'kappa': metrics.cohen_kappa_score,
- 'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'hamming_loss': metrics.hamming_loss,
- 'jaccard_similarity': metrics.jaccard_similarity_score,
- 'log_loss': metrics.log_loss,
- 'matthews_corrcoef': metrics.matthews_corrcoef,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score,
- 'specificity': specificity_score,
- 'roc_auc': metrics.roc_auc_score,
- 'zero_one_loss': metrics.zero_one_loss,
- 'r2': metrics.r2_score,
- 'explained_variance': metrics.explained_variance_score,
- 'neg_mean_absolute_error': metrics.mean_absolute_error,
- 'neg_mean_squared_error': metrics.mean_squared_error,
- 'neg_mean_squared_log_error': metrics.mean_squared_log_error,
- 'neg_median_absolute_error': metrics.median_absolute_error}
-
- byclass_methods = {'f1': metrics.f1_score,
- 'fbeta': metrics.fbeta_score,
- 'precision': metrics.precision_score,
- 'recall': metrics.recall_score}
-
- # create dict to store byclass metrics results
- n_classes = len(np.unique(y))
- labels = np.unique(y)
- byclass_scores = dict.fromkeys(byclass_methods)
- byclass_scores = {key: np.zeros((0, n_classes)) for key, value in byclass_scores.iteritems()}
-
- # remove any byclass_scorers that are not in the scoring list
- byclass_scores = {key: value for key, value in byclass_scores.iteritems() if key in scores}
-
- # check if remaining scorers are valid sklearn metrics
- for i in scores.keys():
- if i not in scoring_methods:
- gs.fatal("Scoring method '%s' is not a valid scoring method. "
- "Valid methods are: %s"
- % (i, ', '.join(scoring_methods.keys())))
-
- # set averaging type for global binary or multiclass scores
- if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
- average = 'binary'
- else:
- average = 'macro'
-
- # create np array to store feature importance scores
- if feature_importances is True:
- fimp = np.zeros((cv.get_n_splits(), X.shape[1]))
- fimp[:] = np.nan
- else:
- fimp = None
-
- # -------------------------------------------------------------------------
- # extract cross-validation indices
- # -------------------------------------------------------------------------
-
- if groups is None:
- k_fold = cv.split(X, y)
- else:
- k_fold = cv.split(X, y, groups=groups)
-
- trains, tests = [], []
- for train_indices, test_indices in k_fold:
- trains.append(train_indices)
- tests.append(test_indices)
-
- # -------------------------------------------------------------------------
- # Perform multiprocessing fitting of clf on each fold
- # -------------------------------------------------------------------------
- clf_resamples = Parallel(n_jobs=n_jobs)(
- delayed(__parallel_fit)(clf, X, y, groups, train_indices, sample_weight)
- for train_indices in trains)
-
- # -------------------------------------------------------------------------
- # loop through each fold and calculate performance metrics
- # -------------------------------------------------------------------------
-
- # store predictions and indices
- predictions = np.zeros((len(y), 3)) # y_true, y_pred, fold
-
- fold = 0
- for train_indices, test_indices in zip(trains, tests):
-
- # create test fold
- X_test, y_test = X[test_indices], y[test_indices]
-
- # prediction of test fold
- y_pred = clf_resamples[fold].predict(X_test)
- predictions[test_indices, 0] = y_test
- predictions[test_indices, 1] = y_pred
- predictions[test_indices, 2] = fold
-
- # calculate global performance metrics
- for m in scores.keys():
- # metrics that require probabilities
- if m == 'brier_loss' or m == 'roc_auc':
- y_prob = clf_resamples[fold].predict_proba(X_test)[:, 1]
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_prob))
-
- # metrics that have no averaging for multiclass
- elif m in ('kappa', 'specificity', 'accuracy', 'hamming_loss',
- 'jaccard_similarity', 'log_loss', 'zero_one_loss',
- 'matthews_corrcoef', 'r2', 'explained_variance',
- 'neg_mean_absolute_error', 'neg_mean_squared_error',
- 'neg_mean_squared_log_error', 'neg_median_absolute_error'):
- scores[m] = np.append(
- scores[m], scoring_methods[m](y_test, y_pred))
-
- # balanced accuracy
- elif m == 'balanced_accuracy':
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average='macro'))
-
- # metrics that have averaging for multiclass
- else:
- scores[m] = np.append(
- scores[m], scoring_methods[m](
- y_test, y_pred, average=average))
-
- # calculate per-class performance metrics
- for key in byclass_scores.keys():
- byclass_scores[key] = np.vstack((
- byclass_scores[key], byclass_methods[key](
- y_test, y_pred, labels=labels, average=None)))
-
- # feature importances using permutation
- if feature_importances is True:
- fimp[fold, :] = varimp_permutation(
- clf_resamples[fold], X_test, y_test, n_permutations,
- scoring_methods[scoring[0]], n_jobs, random_state)
- fold += 1
-
- return (scores, byclass_scores, fimp, clf_resamples, predictions)
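As a standalone illustration of the permutation-importance idea implemented by
varimp_permutation and __permute above (shuffle one predictor at a time and
record how much the score drops), here is a minimal, self-contained sketch;
the data, estimator, and parameter values are assumptions for illustration
only:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=200, n_features=4, random_state=1)
clf = RandomForestClassifier(n_estimators=50, random_state=1).fit(X, y)

rstate = np.random.RandomState(1)
best_score = accuracy_score(y, clf.predict(X))

importances = np.zeros(X.shape[1])
for i in range(X.shape[1]):
    Xscram = np.copy(X)
    Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])  # permute one column
    drop = best_score - accuracy_score(y, clf.predict(Xscram))
    importances[i] = max(drop, 0.0)  # clamp negative drops to zero

print(importances)  # larger values indicate more important predictors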