[GRASS-SVN] r70931 - grass-addons/grass7/raster/r.learn.ml
svn_grass at osgeo.org
Sat Apr 22 13:07:02 PDT 2017
Author: spawley
Date: 2017-04-22 13:07:02 -0700 (Sat, 22 Apr 2017)
New Revision: 70931
Modified:
grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
Log:
r.learn.ml: fixed an issue with loading a previously saved model. Also added an option to perform predictions for the cross-validation resamples
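
In outline, a minimal sketch of the new resample-prediction path, using only names that appear in the diff below (cross_val_scores, predict, the new -r flag / predict_resamples variable); it is illustrative, not a drop-in replacement for the module code:

    # cross_val_scores() gains a `models` argument and now also returns the
    # estimators fitted on each cross-validation fold.
    scores, cscores, fimp, models = cross_val_scores(
        clf, X, y, group_id, class_weights, resampling, scoring,
        importances, n_permutations, predict_resamples, random_state)

    # With the new -r flag set, one raster is written per fold, using the
    # estimator fitted on that fold's training data.
    if predict_resamples:
        for i in range(cv):
            resample_name = output + '_Resample' + str(i)
            predict(estimator=models[i], predictors=maplist, output=resample_name,
                    predict_type='raw', rowincr=rowincr)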
Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-22 19:39:11 UTC (rev 70930)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py 2017-04-22 20:07:02 UTC (rev 70931)
@@ -20,7 +20,6 @@
#% keyword: machine learning
#% keyword: scikit-learn
#%end
-
#%option G_OPT_I_GROUP
#% key: group
#% label: Imagery group to be classified
@@ -28,7 +27,6 @@
#% required: yes
#% multiple: no
#%end
-
#%option G_OPT_R_INPUT
#% key: trainingmap
#% label: Labelled pixels
@@ -36,7 +34,6 @@
#% required: no
#% guisection: Required
#%end
-
#%option G_OPT_V_INPUT
#% key: trainingpoints
#% label: Training point vector
@@ -44,7 +41,6 @@
#% required: no
#% guisection: Required
#%end
-
#%option G_OPT_DB_COLUMN
#% key: field
#% label: Response attribute column
@@ -52,7 +48,6 @@
#% required: no
#% guisection: Required
#%end
-
#%option G_OPT_R_OUTPUT
#% key: output
#% label: Output Map
@@ -60,7 +55,6 @@
#% guisection: Required
#% required: no
#%end
-
#%option string
#% key: classifier
#% label: Classifier
@@ -70,7 +64,6 @@
#% guisection: Classifier settings
#% required: no
#%end
-
#%option
#% key: c
#% type: double
@@ -80,7 +73,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option
#% key: max_features
#% type: integer
@@ -90,7 +82,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option
#% key: max_depth
#% type: integer
@@ -100,7 +91,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option
#% key: min_samples_split
#% type: integer
@@ -110,7 +100,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option
#% key: min_samples_leaf
#% type: integer
@@ -120,7 +109,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option
#% key: n_estimators
#% type: integer
@@ -130,7 +118,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option
#% key: learning_rate
#% type: double
@@ -140,7 +127,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option
#% key: subsample
#% type: double
@@ -150,7 +136,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option integer
#% key: max_degree
#% label: The maximum degree of terms in forward pass
@@ -159,7 +144,6 @@
#% multiple: yes
#% guisection: Classifier settings
#%end
-
#%option integer
#% key: categorymaps
#% multiple: yes
@@ -167,7 +151,6 @@
#% description: Indices of categorical rasters within the imagery group (0..n) that will be one-hot encoded
#% guisection: Optional
#%end
-
#%option string
#% key: cvtype
#% label: Non-spatial or spatial cross-validation
@@ -176,7 +159,6 @@
#% options: non-spatial,clumped,kmeans
#% guisection: Cross validation
#%end
-
#%option
#% key: n_partitions
#% type: integer
@@ -185,7 +167,6 @@
#% answer: 10
#% guisection: Cross validation
#%end
-
#%option G_OPT_R_INPUT
#% key: group_raster
#% label: Custom group ids for training samples from GRASS raster
@@ -193,7 +174,6 @@
#% required: no
#% guisection: Cross validation
#%end
-
#%option
#% key: cv
#% type: integer
@@ -201,7 +181,6 @@
#% answer: 1
#% guisection: Cross validation
#%end
-
#%option
#% key: n_permutations
#% type: integer
@@ -209,40 +188,39 @@
#% answer: 50
#% guisection: Cross validation
#%end
-
#%flag
#% key: t
#% description: Perform hyperparameter tuning only
#% guisection: Cross validation
#%end
-
#%flag
#% key: f
#% description: Calculate permutation importances during cross validation
#% guisection: Cross validation
#%end
-
+#%flag
+#% key: r
+#% label: Make predictions for cross validation resamples
+#% guisection: Cross validation
+#%end
#%option G_OPT_F_OUTPUT
#% key: errors_file
#% label: Save cross-validation global accuracy results to csv
#% required: no
#% guisection: Cross validation
#%end
-
#%option G_OPT_F_OUTPUT
#% key: fimp_file
#% label: Save feature importances to csv
#% required: no
#% guisection: Cross validation
#%end
-
#%option G_OPT_F_OUTPUT
#% key: param_file
#% label: Save hyperparameter search scores to csv
#% required: no
#% guisection: Cross validation
#%end
-
#%option
#% key: random_state
#% type: integer
@@ -250,7 +228,6 @@
#% answer: 1
#% guisection: Optional
#%end
-
#%option
#% key: lines
#% type: integer
@@ -258,7 +235,6 @@
#% answer: 25
#% guisection: Optional
#%end
-
#%option
#% key: indexes
#% type: integer
@@ -267,7 +243,6 @@
#% guisection: Optional
#% multiple: yes
#%end
-
#%option
#% key: n_jobs
#% type: integer
@@ -275,77 +250,65 @@
#% answer: -2
#% guisection: Optional
#%end
-
#%flag
#% key: s
#% label: Standardization preprocessing
#% guisection: Optional
#%end
-
#%flag
#% key: i
#% label: Impute training data preprocessing
#% guisection: Optional
#%end
-
#%flag
#% key: p
#% label: Output class membership probabilities
#% guisection: Optional
#%end
-
#%flag
#% key: z
#% label: Only predict class probabilities
#% guisection: Optional
#%end
-
#%flag
#% key: m
#% description: Build model only - do not perform prediction
#% guisection: Optional
#%end
-
#%flag
#% key: b
#% description: Balance training data using class weights
#% guisection: Optional
#%end
-
#%flag
#% key: l
#% label: Use memory swap
#% guisection: Optional
#%end
-
#%option G_OPT_F_OUTPUT
#% key: save_training
#% label: Save training data to csv
#% required: no
#% guisection: Optional
#%end
-
#%option G_OPT_F_INPUT
#% key: load_training
#% label: Load training data from csv
#% required: no
#% guisection: Optional
#%end
-
#%option G_OPT_F_OUTPUT
#% key: save_model
#% label: Save model from file
#% required: no
#% guisection: Optional
#%end
-
#%option G_OPT_F_INPUT
#% key: load_model
#% label: Load model from file
#% required: no
#% guisection: Optional
#%end
-
#%rules
#% exclusive: trainingmap,load_model
#% exclusive: load_training,save_training
@@ -463,7 +426,7 @@
def cross_val_scores(estimator, X, y, groups=None, sample_weight=None, cv=3,
scoring=['accuracy'], feature_importances=False,
- n_permutations=25, random_state=None):
+ n_permutations=25, models=False, random_state=None):
"""
Stratified Kfold and GroupFold cross-validation using multiple
@@ -481,6 +444,7 @@
scoring: List of performance metrics to use
feature_importances: Boolean to perform permutation-based importances
n_permutations: Number of permutations during feature importance
+ models: Boolean, return a list of the fitted models
random_state: Seed to pass to the random number generator
"""
@@ -489,6 +453,7 @@
RandomizedSearchCV, GridSearchCV, StratifiedKFold)
estimator = deepcopy(estimator)
+ fitted_models = []
# create model_selection method
if isinstance(cv, int):
@@ -505,7 +470,7 @@
# create dictionary of lists to store metrics
scores = dict.fromkeys(scoring)
- scores = { key: [] for key, value in scores.iteritems()}
+ scores = {key: [] for key, value in scores.iteritems()}
scoring_methods = {'accuracy': metrics.accuracy_score,
'balanced_accuracy': metrics.recall_score,
'average_precision': metrics.average_precision_score,
@@ -588,6 +553,9 @@
else:
if sample_weight is None: estimator.fit(X_train, y_train)
else: estimator.fit(X_train, y_train, sample_weight=weights)
+
+ if models is True:
+ fitted_models.append(deepcopy(estimator))
# prediction of test fold
y_pred = estimator.predict(X_test)
@@ -639,10 +607,10 @@
n_permutations, scoring_methods[scoring[0]],
random_state)))
- return(scores, byclass_scores, fimp)
+ return(scores, byclass_scores, fimp, fitted_models)
-def predict(estimator, predictors, output, predict_type='raw', labels=None,
+def predict(estimator, predictors, output, predict_type='raw',
index=None, rowincr=25):
"""
@@ -659,18 +627,8 @@
rowincr: Integer of raster rows to process at one time
"""
- # current region
- current = Region()
-
- # determine output data type and nodata
- if labels is not None:
- ftype = 'CELL'
- nodata = -2147483648
- else:
- ftype = 'FCELL'
- nodata = np.nan
-
# open predictors as list of rasterrow objects
+ current = Region()
n_features = len(predictors)
rasstack = [0] * n_features
@@ -682,11 +640,6 @@
grass.fatal("GRASS raster " + predictors[i] +
" does not exist.... exiting")
- # create and open RasterRow object for writing of classification result
- if predict_type == 'raw':
- classification = RasterRow(output)
- classification.open('w', ftype, overwrite=True)
-
# Prediction using row blocks
for rowblock in range(0, current.rows, rowincr):
grass.percent(rowblock, current.rows, rowincr)
@@ -722,10 +675,24 @@
result = estimator.predict(flat_pixels)
result = result.reshape((rowincr, current.cols))
+ # determine nodata value and grass raster type
+ if result.dtype == 'float':
+ nodata = np.nan
+ ftype = 'FCELL'
+ else:
+ nodata = -2147483648
+ ftype = 'CELL'
+
# replace NaN values so that the prediction does not have a border
result[np.nonzero(np.isnan(mask))] = nodata
# for each row we can perform computation, and write the result
+ if rowblock == 0:
+ # create and open RasterRow object for writing of classification result
+ if predict_type == 'raw':
+ classification = RasterRow(output)
+ classification.open('w', ftype, overwrite=True)
+
for row in range(rowincr):
newrow = Buffer((result.shape[1],), mtype=ftype)
newrow[:] = result[row, :]
@@ -755,8 +722,6 @@
for iclass, label in enumerate(index):
result_proba_class = result_proba[:, label]
result_proba_class = result_proba_class.reshape((rowincr, current.cols))
-
- # replace NaN values so that the prediction does not have a border
result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
for row in range(rowincr):
@@ -766,10 +731,7 @@
# close all maps
for i in range(n_features): rasstack[i].close()
-
- # close all class probability maps
- if predict_type == 'raw':
- classification.close()
+ if predict_type == 'raw': classification.close()
if predict_type == 'prob':
try:
for iclass in range(n_classes):
@@ -906,7 +868,6 @@
random_state=random_state,
n_jobs=n_jobs,
oob_score=False),
-
'GradientBoostingClassifier':
GradientBoostingClassifier(learning_rate=p['learning_rate'],
n_estimators=p['n_estimators'],
@@ -1109,9 +1070,11 @@
Args
----
group: String; GRASS imagery group
+
Returns
-------
- maplist: Python list containing individual GRASS raster maps
+ maplist: List containing individual GRASS raster maps
+ map_names: List with print friendly map names
"""
groupmaps = im.group(group=group, flags="g",
quiet=True, stdout_=PIPE).outputs.stdout
@@ -1142,24 +1105,19 @@
y: 1D numpy array with the response variable
coordinates: 2D numpy array of sample coordinates
"""
-
- import pandas as pd
-
# open grass vector
points = VectorTopo(gvector.split('@')[0])
points.open('r')
# create link to attribute table
points.dblinks.by_name(name=gvector)
- link = points.dblinks[0]
- # convert to pandas array
- gvector_df = pd.DataFrame(points.table_to_dict()).T
- gvector_df.columns = points.table.columns
- y = gvector_df.loc[:, field].as_matrix()
- y = y.astype(float)
+ # extract table field to numpy array
+ table = points.table
+ cur = table.execute("SELECT {field} FROM {name}".format(field=field, name=table.name))
+ y = np.array([np.isnan if c is None else c[0] for c in cur])
- # extract training data
+ # extract raster data
X = np.zeros((points.num_primitives()['point'], len(grasters)), dtype=float)
for i, raster in enumerate(grasters):
rio = RasterRow(raster)
@@ -1197,11 +1155,6 @@
except:
grass.fatal("Scikit learn 0.18 or newer is not installed")
- try:
- import pandas as pd
- except:
- grass.fatal("Pandas is not installed")
-
group = options['group']
trainingmap = options['trainingmap']
trainingpoints = options['trainingpoints']
@@ -1220,6 +1173,7 @@
probability = flags['p']
prob_only = flags['z']
tuneonly = flags['t']
+ predict_resamples = flags['r']
rowincr = int(options['lines'])
random_state = int(options['random_state'])
model_save = options['save_model']
@@ -1227,6 +1181,7 @@
load_training = options['load_training']
save_training = options['save_training']
importances = flags['f']
+
indexes = options['indexes']
if ',' in indexes:
indexes = [int(i) for i in indexes.split(',')]
@@ -1241,6 +1196,7 @@
errors_file = options['errors_file']
fimp_file = options['fimp_file']
param_file = options['param_file']
+
balance = flags['b']
if balance is True:
balance = 'balanced'
@@ -1248,16 +1204,16 @@
if ',' in categorymaps:
categorymaps = [int(i) for i in categorymaps.split(',')]
else: categorymaps = None
-
+
# error checking
# feature importances selected by no cross-validation scheme used
if importances is True and cv == 1:
grass.fatal('Feature importances require cross-validation cv > 1')
-
+
# output map has not been entered and modelonly is not set to True
if output == '' and modelonly is True:
grass.fatal('No output map specified')
-
+
# perform prediction only for class probabilities but probability flag is not set to True
if prob_only is True:
probability = True
@@ -1484,14 +1440,20 @@
grass.message('Best parameters:')
grass.message(str(clf.best_params_))
if param_file != '':
- param_df = pd.DataFrame(clf.cv_results_)
- param_df.to_csv(param_file)
+ try:
+ import pandas as pd
+ param_df = pd.DataFrame(clf.cv_results_)
+ param_df.to_csv(param_file)
+ except:
+ grass.message((
+ "Pandas is not installed ",
+ "cannot export hyperparameter search results to csv"))
# cross-validation
# -----------------
# If cv > 1 then use cross-validation to generate performance measures
- if cv > 1 and tuneonly is not True :
+ if cv > 1 and tuneonly is not True:
if mode == 'classification' and cv > np.histogram(
y, bins=len(np.unique(y)))[0].min():
grass.message(os.linesep)
@@ -1503,14 +1465,15 @@
grass.message(
"Cross validation global performance measures......:")
- # cross-validate the training object
+ # cross-validate
if mode == 'classification' and \
len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
scoring.append('roc_auc')
- scoring.append('matthews_corrcoef'),
- scores, cscores, fimp = cross_val_scores(
+ scoring.append('matthews_corrcoef')
+
+ scores, cscores, fimp, models = cross_val_scores(
clf, X, y, group_id, class_weights, resampling, scoring,
- importances, n_permutations, random_state)
+ importances, n_permutations, predict_resamples, random_state)
# global scores
for method, val in scores.iteritems():
@@ -1572,17 +1535,35 @@
if prob_only is False:
grass.message('Predicting classification raster...')
predict(estimator=clf, predictors=maplist, output=output, predict_type='raw',
- labels=np.unique(y), rowincr=rowincr)
+ rowincr=rowincr)
+ if predict_resamples is True:
+ for i in range(cv):
+ resample_name = output + '_Resample' + str(i)
+ predict(estimator=models[i], predictors=maplist, output=resample_name, predict_type='raw',
+ rowincr=rowincr)
+
if probability is True:
grass.message('Predicting class probabilities...')
predict(estimator=clf, predictors=maplist, output=output, predict_type='prob',
- labels=np.unique(y), index=indexes, rowincr=rowincr)
+ index=indexes, rowincr=rowincr)
+ if predict_resamples is True:
+ for i in range(cv):
+ resample_name = output + '_Resample' + str(i)
+ predict(estimator=models[i], predictors=maplist, output=resample_name, predict_type='prob',
+ index=indexes, rowincr=rowincr)
+
elif mode == 'regression':
grass.message('Predicting regression raster...')
predict(estimator=clf, predictors=maplist, output=output, predict_type='raw',
- labels=None, rowincr=rowincr)
+ rowincr=rowincr)
+
+ if predict_resamples is True:
+ for i in range(cv):
+ resample_name = output + '_Resample' + str(i)
+ predict(estimator=models[i], predictors=maplist, output=resample_name, predict_type='prob',
+ rowincr=rowincr)
else:
grass.message("Model built and now exiting")