[GRASS-SVN] r71040 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
svn_grass at osgeo.org
Sat May 6 11:46:13 PDT 2017
Author: spawley
Date: 2017-05-06 11:46:13 -0700 (Sat, 06 May 2017)
New Revision: 71040
Modified:
grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
Log:
r.learn.ml: automatically turn off multiprocessing for multithreaded classifiers
Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py 2017-05-06 09:18:09 UTC (rev 71039)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_crossval.py 2017-05-06 18:46:13 UTC (rev 71040)
@@ -136,9 +136,6 @@
applied only to XGBoost and Gradient Boosting classifiers
"""
- from sklearn.model_selection import (
- RandomizedSearchCV, GridSearchCV, StratifiedKFold)
-
# create training and test folds
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]
@@ -149,7 +146,7 @@
if sample_weight is not None: weights = sample_weight[train_indices]
# train estimator
- if groups is not None and isinstance(estimator, (RandomizedSearchCV, GridSearchCV)) is True:
+ if groups is not None and type(estimator).__name__ in ['RandomizedSearchCV', 'GridSearchCV']:
if sample_weight is None: estimator.fit(X_train, y_train, groups=groups_train)
else: estimator.fit(X_train, y_train, groups=groups_train, sample_weight=weights)
else:
Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py 2017-05-06 09:18:09 UTC (rev 71039)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py 2017-05-06 18:46:13 UTC (rev 71040)
@@ -201,73 +201,188 @@
" does not exist.... exiting")
# -------------------------------------------------------------------------
- # parallel prediction
+ # turn off multiprocessing for multi-threaded classifiers
# -------------------------------------------------------------------------
- # create lists of row increments
- row_mins, row_maxs = [], []
- for row in range(0, current.rows, rowincr):
- if row+rowincr > current.rows:
- rowincr = current.rows - row
- row_mins.append(row)
- row_maxs.append(row+rowincr)
+ # first unwrap the estimator from any potential pipelines or gridsearchCV
+ if type(estimator).__name__ == 'Pipeline':
+ clf_type = estimator.named_steps['classifier']
+ else:
+ clf_type = estimator
- # perform predictions on lists of row increments in parallel
- predictions = Parallel(n_jobs=n_jobs)(
- delayed(__predict_parallel)
- (estimator, predictors, predict_type, current, row_min, row_max)
- for row_min, row_max in zip(row_mins, row_maxs))
+ if type(clf_type).__name__ == 'GridSearchCV' or \
+ type(clf_type).__name__ == 'RandomizedSearchCV':
+ clf_type = clf_type.best_estimator_
- # unpack the results
- results = []
- ftypes = []
- for block in predictions:
- results.append(block[0])
- ftypes.append(block[1])
+ # check name against already multithreaded classifiers
+ if n_jobs == 1 or type(clf_type).__name__ in ['RandomForestClassifier',
+ 'RandomForestRegressor',
+ 'ExtraTreesClassifier',
+ 'ExtraTreesRegressor',
+ 'KNeighborsClassifier',
+ 'XGBClassifier',
+ 'XGBRegressor']:
+ # ---------------------------------------------------------------------
+ # sequential prediction
+ # ---------------------------------------------------------------------
- # -------------------------------------------------------------------------
- # writing of predicted results for classification
- # -------------------------------------------------------------------------
- if predict_type == 'raw':
- classification = RasterRow(output)
- classification.open('w', ftypes[0], overwrite=True)
+ # Prediction using blocks of rows per iteration
+ for rowblock in range(0, current.rows, rowincr):
+ gscript.percent(rowblock, current.rows, rowincr)
- # write the classification result
- for result_block in results:
- for row in range(result_block.shape[0]):
- newrow = Buffer((result_block.shape[1],), mtype=ftypes[0])
- newrow[:] = result_block[row, :]
- classification.put_row(newrow)
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
- # -------------------------------------------------------------------------
- # writing of predicted results for probabilities
- # -------------------------------------------------------------------------
- if predict_type == 'prob':
- # determine number of classes
- if index is None:
- index = range(results[0].shape[2])
- n_classes = len(index)
- else:
- n_classes = len(np.unique(index))
+ # loop through each row, and each band and add to 2D img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
- # create and open RasterRow objects for probabilities
- prob_out_raster = [0] * n_classes
- prob = [0] * n_classes
- for iclass, label in enumerate(index):
- prob_out_raster[iclass] = output + '_classPr' + str(label)
- prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite=True)
+ # create mask
+ img_np_row[img_np_row == -2147483648] = np.nan
+ mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+ for feature in range(n_features):
+ invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+ mask[invalid_indexes] = np.nan
- # write the class probability results
- for results_proba_block in results:
+ # reshape each row-band matrix into a n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaNs prior to passing to scikit-learn predict
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # perform prediction for classification/regression
+ if predict_type == 'raw':
+ result = estimator.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
+
+ # determine nodata value and grass raster type
+ if result.dtype == 'float':
+ nodata = np.nan
+ ftype = 'FCELL'
+ else:
+ nodata = -2147483648
+ ftype = 'CELL'
+
+ # replace NaN values so that the prediction does not have a border
+ result[np.nonzero(np.isnan(mask))] = nodata
+
+ # on first iteration create the RasterRow object
+ if rowblock == 0:
+ if predict_type == 'raw':
+ classification = RasterRow(output)
+ classification.open('w', ftype, overwrite=True)
+
+ # write the classification result
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
+
+ # perform prediction for class probabilities
+ if predict_type == 'prob':
+ result_proba = estimator.predict_proba(flat_pixels)
+
+ # on first loop determine number of probability classes
+ # and open rasterrow objects for writing
+ if rowblock == 0:
+ if index is None:
+ index = range(result_proba.shape[1])
+ n_classes = len(index)
+ else:
+ n_classes = len(np.unique(index))
+
+ # create and open RasterRow objects for probabilities
+ prob_out_raster = [0] * n_classes
+ prob = [0] * n_classes
+ for iclass, label in enumerate(index):
+ prob_out_raster[iclass] = output + '_classPr' + str(label)
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ for iclass, label in enumerate(index):
+ result_proba_class = result_proba[:, label]
+ result_proba_class = result_proba_class.reshape((rowincr, current.cols))
+ result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
+
+ for row in range(rowincr):
+ newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+ else:
+
+ # ---------------------------------------------------------------------
+ # parallel prediction
+ # ---------------------------------------------------------------------
+
+ # create lists of row increments
+ row_mins, row_maxs = [], []
+ for row in range(0, current.rows, rowincr):
+ if row+rowincr > current.rows:
+ rowincr = current.rows - row
+ row_mins.append(row)
+ row_maxs.append(row+rowincr)
+
+ # perform predictions on lists of row increments in parallel
+ predictions = Parallel(n_jobs=n_jobs, max_nbytes=None)(
+ delayed(__predict_parallel)
+ (estimator, predictors, predict_type, current, row_min, row_max)
+ for row_min, row_max in zip(row_mins, row_maxs))
+
+ # unpack the results
+ results = []
+ ftypes = []
+ for block in predictions:
+ results.append(block[0])
+ ftypes.append(block[1])
+
+ # -------------------------------------------------------------------------
+ # writing of predicted results for classification
+ # -------------------------------------------------------------------------
+ if predict_type == 'raw':
+ classification = RasterRow(output)
+ classification.open('w', ftypes[0], overwrite=True)
+
+ # write the classification result
+ for result_block in results:
+ for row in range(result_block.shape[0]):
+ newrow = Buffer((result_block.shape[1],), mtype=ftypes[0])
+ newrow[:] = result_block[row, :]
+ classification.put_row(newrow)
+
+ # -------------------------------------------------------------------------
+ # writing of predicted results for probabilities
+ # -------------------------------------------------------------------------
+ if predict_type == 'prob':
+ # determine number of classes
+ if index is None:
+ index = range(results[0].shape[2])
+ n_classes = len(index)
+ else:
+ n_classes = len(np.unique(index))
+
+ # create and open RasterRow objects for probabilities
+ prob_out_raster = [0] * n_classes
+ prob = [0] * n_classes
for iclass, label in enumerate(index):
- result_proba_class = results_proba_block[:, :, label]
+ prob_out_raster[iclass] = output + '_classPr' + str(label)
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
- for row in range(result_proba_class.shape[0]):
- newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
+ # write the class probability results
+ for results_proba_block in results:
+ for iclass, label in enumerate(index):
+ result_proba_class = results_proba_block[:, :, label]
+ for row in range(result_proba_class.shape[0]):
+ newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
# -------------------------------------------------------------------------
# close all maps
# -------------------------------------------------------------------------
More information about the grass-commit mailing list