[GRASS-SVN] r70999 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
svn_grass at osgeo.org
Tue May 2 09:45:13 PDT 2017
Author: spawley
Date: 2017-05-02 09:45:12 -0700 (Tue, 02 May 2017)
New Revision: 70999
Modified:
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py
Log:
r.learn.ml using multiprocessing for all cross-validations
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-05-02 13:41:15 UTC (rev 70998)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-05-02 16:45:12 UTC (rev 70999)
@@ -249,13 +249,15 @@
#%flag
#% key: f
-#% description: Calculate permutation importances during cross validation
+#% label: Estimator permutation-based feature importances
+#% description: Estimate feature importance using a permutation-based method
#% guisection: Cross validation
#%end
#%flag
#% key: r
#% label: Make predictions for cross validation resamples
+#% description: Produce raster predictions for all cross validation resamples
#% guisection: Cross validation
#%end
@@ -672,7 +674,6 @@
inner = ShuffleSplit(n_splits=1, test_size=0.33, random_state=random_state)
else:
inner = GroupShuffleSplit(n_splits=1, test_size=0.33, random_state=random_state)
-
else:
inner = None
@@ -802,10 +803,10 @@
# perform the cross-validation
scores, cscores, fimp, models, preds = cross_val_scores(
clf, X, y, group_id, class_weights, outer, scoring,
- importances, n_permutations, predict_resamples, random_state)
+ importances, n_permutations, random_state, n_jobs)
+
preds = np.hstack((preds, sample_coords))
- # global scores
for method, val in scores.iteritems():
gscript.message(
method+":\t%0.3f\t+/-SD\t%0.3f" %
Modified: grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py 2017-05-02 13:41:15 UTC (rev 70998)
+++ grass-addons/grass7/raster/r.learn.ml/r_learn_utils.py 2017-05-02 16:45:12 UTC (rev 70999)
@@ -100,9 +100,34 @@
return scores
def parallel_fit(estimator, X, y, groups, train_indices, test_indices, sample_weight):
    """
    Fit an independent copy of an estimator on one cross-validation fold

    The estimator is deep-copied before fitting so that the caller's object
    is never mutated and every fold gets its own fitted model. Without the
    copy, tasks that joblib executes in the calling process (e.g. n_jobs=1)
    would all refit and return the same object, leaving every fold with the
    model trained on the last fold.

    Args
    ----
    estimator: Scikit learn estimator; left unmodified
    X, y: 2D and 1D numpy array of training data and labels
    groups: 1D numpy array containing group labels, or None
    train_indices: Indices of the samples used to fit this fold
    test_indices: Indices of the held-out samples; not used for fitting,
        accepted so the call mirrors the (train, test) pairs of the splitter
    sample_weight: 1D numpy array[n_samples,] of sample weights, or None

    Returns
    -------
    fold_estimator: Copy of estimator fitted on the training fold
    """
    from copy import deepcopy

    # never fit the shared instance: each fold needs its own model
    fold_estimator = deepcopy(estimator)

    # subset the training fold
    X_train, y_train = X[train_indices], y[train_indices]

    # collect only the optional fit arguments that actually apply
    fit_params = {}
    if sample_weight is not None:
        fit_params['sample_weight'] = sample_weight[train_indices]

    if groups is not None:
        # group labels are only forwarded to the hyperparameter search
        # wrappers; sklearn is imported here because this is the only
        # branch that needs to recognise them
        from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

        if isinstance(fold_estimator, (RandomizedSearchCV, GridSearchCV)):
            fit_params['groups'] = groups[train_indices]

    # train estimator
    fold_estimator.fit(X_train, y_train, **fit_params)

    return fold_estimator
+
+
def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
scoring='accuracy', feature_importances=False,
- n_permutations=25, models=False, random_state=None):
+ n_permutations=25, random_state=None, n_jobs=-1):
"""
Stratified Kfold and GroupFold cross-validation using multiple
scoring metrics and permutation feature importances
@@ -110,32 +135,46 @@
Args
----
estimator: Scikit learn estimator
- X: 2D numpy array of training data
- y: 1D numpy array representing response variable
+ X, y: 2D and 1D numpy array of training data and labels
groups: 1D numpy array containing group labels
sample_weight: 1D numpy array[n_samples,] of sample weights
cv: Integer of cross-validation folds or sklearn.model_selection object
- sampling: Over- or under-sampling object with fit_sample method
scoring: List of performance metrics to use
feature_importances: Boolean to perform permutation-based importances
n_permutations: Number of permutations during feature importance
- models: Boolean, return a list of the fitted models
random_state: Seed to pass to the random number generator
+
+ Returns
+ -------
+ scores: Dict, containing lists of scores per cross-validation fold
+ byclass_scores: Dict, containing scores per class
+ fimp: 2D numpy array of permutation feature importances per feature
+ clf_resamples: List, fitted estimators
+ predictions: 2D numpy array with y_true, y_pred, fold
"""
from sklearn import metrics
from sklearn.model_selection import (
RandomizedSearchCV, GridSearchCV, StratifiedKFold)
+ from sklearn.externals.joblib import Parallel, delayed
+ # -------------------------------------------------------------------------
+ # create copies of estimator and create cross-validation iterator
+ # -------------------------------------------------------------------------
+
# deepcopy estimator
- estimator = deepcopy(estimator)
- fitted_models = []
+ clf = deepcopy(estimator)
# create model_selection method
- if isinstance(cv, int): cv = StratifiedKFold(n_splits=cv)
+ if isinstance(cv, int):
+ cv = StratifiedKFold(n_splits=cv)
+ # -------------------------------------------------------------------------
# create dictionary of lists to store metrics
- if isinstance(scoring, basestring): scoring = [scoring]
+ # -------------------------------------------------------------------------
+
+ if isinstance(scoring, basestring):
+ scoring = [scoring]
scores = dict.fromkeys(scoring)
scores = {key: [] for key, value in scores.iteritems()}
scoring_methods = {'accuracy': metrics.accuracy_score,
@@ -162,7 +201,7 @@
'precision': metrics.precision_score,
'recall': metrics.recall_score}
- # create diction to store byclass metrics results
+ # create dict to store byclass metrics results
n_classes = len(np.unique(y))
labels = np.unique(y)
byclass_scores = dict.fromkeys(byclass_methods)
@@ -176,9 +215,9 @@
try:
list(scoring_methods.keys()).index(i)
except:
- print('Scoring ' + i + ' is not a valid scoring method')
- print('Valid methods are:')
- print(scoring_methods.keys())
+ gscript.fatal('Scoring ' + i + ' is not a valid scoring method')
+ gscript.message('Valid methods are:')
+ gscript.message(scoring_methods.keys())
# set averaging type for global binary or multiclass scores
if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
@@ -193,41 +232,48 @@
else:
fimp = None
- # generate Kfold indices
+ # -------------------------------------------------------------------------
+ # extract cross-validation indices
+ # -------------------------------------------------------------------------
+
if groups is None:
k_fold = cv.split(X, y)
else:
k_fold = cv.split(X, y, groups=groups)
+ trains, tests = [], []
+ for train_indices, test_indices in k_fold:
+ trains.append(deepcopy(train_indices))
+ tests.append(deepcopy(test_indices))
+
+ # -------------------------------------------------------------------------
+ # Perform multiprocessing fitting of clf on each fold
+ # -------------------------------------------------------------------------
+
+ # Multiprocessing-backed parallel loops cannot be nested, setting n_jobs=1
+ if isinstance(clf, (GridSearchCV, RandomizedSearchCV)):
+ n_jobs = 1
+
+ clf_resamples = Parallel(n_jobs=n_jobs)(
+ delayed(parallel_fit)(clf, X, y, groups, train_indices,
+ test_indices, sample_weight)
+ for train_indices, test_indices in zip(trains, tests))
+
# store predictions and indices
predictions = np.zeros((len(y), 3)) # y_true, y_pred, fold
- # train on k-1 folds and test of k folds
+ # -------------------------------------------------------------------------
+ # loop through each fold and calculate performance metrics
+ # -------------------------------------------------------------------------
fold = 0
- for train_indices, test_indices in k_fold:
+ for train_indices, test_indices in zip(trains, tests):
# create training and test folds
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]
- if groups is not None: groups_train = groups[train_indices]
- else: groups_train = None
- # subset training and test fold sample_weight
- if sample_weight is not None: weights = sample_weight[train_indices]
-
- # train estimator
- if groups is not None and isinstance(estimator, (RandomizedSearchCV, GridSearchCV)) is True:
- if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
- else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
- else:
- if sample_weight is None: estimator.fit(X_train, y_train)
- else: estimator.fit(X_train, y_train, sample_weight=weights)
-
- # optionally store the fitted models on each resample
- if models is True: fitted_models.append(deepcopy(estimator))
-
# prediction of test fold
- y_pred = estimator.predict(X_test)
+ y_pred = clf_resamples[fold].predict(X_test)
predictions[test_indices, 0] = y_test
predictions[test_indices, 1] = y_pred
predictions[test_indices, 2] = fold
@@ -236,7 +282,7 @@
for m in scores.keys():
# metrics that require probabilities
if m == 'brier_loss' or m == 'roc_auc':
- y_prob = estimator.predict_proba(X_test)[:, 1]
+ y_prob = clf_resamples[fold].predict_proba(X_test)[:, 1]
scores[m] = np.append(
scores[m], scoring_methods[m](y_test, y_prob))
@@ -269,18 +315,18 @@
if feature_importances is True:
if bool((np.isnan(fimp)).all()) is True:
fimp = varimp_permutation(
- estimator, X_test, y_test, n_permutations,
+ clf_resamples[fold], X_test, y_test, n_permutations,
scoring_methods[scoring[0]],
random_state)
else:
fimp = np.row_stack(
(fimp, varimp_permutation(
- estimator, X_test, y_test,
+ clf_resamples[fold], X_test, y_test,
n_permutations, scoring_methods[scoring[0]],
random_state)))
fold += 1
- return(scores, byclass_scores, fimp, fitted_models, predictions)
+ return(scores, byclass_scores, fimp, clf_resamples, predictions)
def predict(estimator, predictors, output, predict_type='raw',
@@ -315,7 +361,10 @@
gscript.fatal("GRASS raster " + predictors[i] +
" does not exist.... exiting")
+ # -------------------------------------------------------------------------
# Prediction using blocks of rows per iteration
+ # -------------------------------------------------------------------------
+
for rowblock in range(0, current.rows, rowincr):
gscript.percent(rowblock, current.rows, rowincr)
@@ -403,7 +452,9 @@
newrow[:] = result_proba_class[row, :]
prob[iclass].put_row(newrow)
+ # -------------------------------------------------------------------------
# close all maps
+ # -------------------------------------------------------------------------
for i in range(n_features): rasstack[i].close()
if predict_type == 'raw': classification.close()
if predict_type == 'prob':
More information about the grass-commit
mailing list