[GRASS-SVN] r69684 - grass-addons/grass7/raster/r.randomforest
svn_grass at osgeo.org
Thu Oct 6 21:29:33 PDT 2016
Author: spawley
Date: 2016-10-06 21:29:32 -0700 (Thu, 06 Oct 2016)
New Revision: 69684
Modified:
grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
added ncores option and fixed a difference in behaviour between scikit-learn 0.17 and 0.18
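
A minimal sketch of the pattern this revision introduces: the new ncores option is read as an integer and handed to the scikit-learn estimators through their n_jobs parameter, as in the hunks below. The estimator choice and the n_estimators value here are illustrative assumptions, not the module's defaults.

    # sketch only: in the module, ncores comes from int(options['ncores'])
    from sklearn.ensemble import RandomForestClassifier

    ncores = -1  # -1 asks scikit-learn to use all available cores
    clf = RandomForestClassifier(n_estimators=500, n_jobs=ncores)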
Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-10-06 23:45:34 UTC (rev 69683)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-10-07 04:29:32 UTC (rev 69684)
@@ -1,7 +1,6 @@
#!/usr/bin/env python
############################################################################
-#
-# MODULE: r.scikit.learn
+# MODULE: r.randomforest
# AUTHOR: Steven Pawley
# PURPOSE: Supervised classification and regression of GRASS rasters using the
# python scikit-learn package
@@ -227,6 +226,14 @@
#% guisection: Optional
#%end
+#%option
+#% key: ncores
+#% type: integer
+#% description: Number of processing cores. Default -1 is all cores
+#% answer: -1
+#% guisection: Optional
+#%end
+
#%flag
#% key: p
#% label: Output class membership probabilities
@@ -303,7 +310,8 @@
def cleanup():
- grass.run_command("g.remove", name='clfmasktmp', flags="f", type="raster", quiet=True)
+ grass.run_command("g.remove", name='clfmasktmp', flags="f",
+ type="raster", quiet=True)
def sample_predictors_byrow(response, predictors):
@@ -320,9 +328,10 @@
if rasstack[i].exist() == True:
rasstack[i].open('r')
else:
- grass.fatal("GRASS raster " + maplist[i] + " does not exist.... exiting")
+ grass.fatal("GRASS raster " + maplist[i] +
+ " does not exist.... exiting")
- # use grass.pygrass.gis.region to get information about the current region, particularly
+ # use grass.pygrass.gis.region to get information about the current region
current = Region()
# determine cell storage type of training roi raster
@@ -335,7 +344,7 @@
roi_stats = roi_stats.split(os.linesep)[0]
nlabel_pixels = int(str(roi_stats).split('=')[1])
- # Create a zero numpy array with the dimensions of the number of columns in the region
+ # Create a zero numpy array with the dimensions of the number of columns
# and the number of bands plus an additional band to attach the labels
tindex = 0
training_labels = []
@@ -348,20 +357,25 @@
roi_row_np = roi_raster[row]
# check if any of those pixels are labelled (not equal to nodata)
- # can use even if roi is FCELL because nodata will be nan and this is not returned anyway
        + # can use even if roi is FCELL because nodata will be nan and this is
        + # not returned anyway
is_train = np.nonzero(roi_row_np > -2147483648)
training_labels = np.append(training_labels, roi_row_np[is_train])
nlabels_in_row = np.array(is_train).shape[1]
# if there are any labelled pixels
- # loop through each imagery band for that row and put the data into the img_row_band_np array
        + # loop through each imagery band for that row and put the data into
        + # the img_row_band_np array
if np.isnan(roi_row_np).all() != True:
for band in range(n_features):
imagerow_np = rasstack[band][row]
# attach the label values onto the last column
- training_data[tindex : tindex+nlabels_in_row, band] = imagerow_np[is_train]
+ training_data[tindex : tindex+nlabels_in_row, band] =\
+ imagerow_np[is_train]
tindex = tindex + nlabels_in_row
@@ -402,10 +416,12 @@
for i in range(n_features):
if RasterRow(predictors[i]).exist() != True:
- grass.fatal("GRASS raster " + predictors[i] + " does not exist.... exiting")
+ grass.fatal("GRASS raster " + predictors[i]
+ + " does not exist.... exiting")
# check if any of those pixels are labelled (not equal to nodata)
- # can use even if roi is FCELL because nodata will be nan and this is not returned anyway
+ # can use even if roi is FCELL because nodata will be nan and this is not
+ # returned anyway
is_train = np.nonzero(response_np > -2147483648)
training_labels = response_np[is_train]
n_labels = np.array(is_train).shape[1]
@@ -430,13 +446,13 @@
# Remove nan rows from training data
training_data = training_data[~np.isnan(training_data).any(axis=1)]
-
roi_gr.close()
# return X and y data
return(training_data[:, 0:n_features], training_data[:, n_features])
-def prediction(clf, predictors, class_probabilities, rowincr, output, mode, labels):
+def prediction(clf, predictors, class_probabilities,
+ rowincr, output, mode, labels):
class_list = np.unique(labels)
nclasses = len(class_list)
@@ -449,16 +465,18 @@
if rasstack[i].exist() == True:
rasstack[i].open('r')
else:
- grass.fatal("GRASS raster " + maplist[i] + " does not exist.... exiting")
+ grass.fatal("GRASS raster " + maplist[i]
+ + " does not exist.... exiting")
- # use grass.pygrass.gis.region to get information about the current region, particularly
+ # use grass.pygrass.gis.region to get information about the current region
current = Region()
# create a imagery mask
- # the input rasters might have different dimensions in terms of value and non-value pixels.
- # r.series used to automatically create a mask by propagating the null values
+ # the input rasters might have different dimensions and non-value pixels.
+ # r.series used to automatically create a mask by propagating the nulls
clfmask = 'clfmasktmp'
- grass.run_command("r.series", output=clfmask, input=predictors, method='count', flags='n')
+ grass.run_command("r.series", output=clfmask,
+ input=predictors, method='count', flags='n')
mask_raster = RasterRow(clfmask)
mask_raster.open('r')
@@ -478,7 +496,8 @@
prob_out_raster = [0] * nclasses
prob = [0] * nclasses
for iclass in range(nclasses):
- prob_out_raster[iclass] = output + '_classPr' + str(int(class_list[iclass]))
+ prob_out_raster[iclass] = output + \
+ '_classPr' + str(int(class_list[iclass]))
prob[iclass] = RasterRow(prob_out_raster[iclass])
prob[iclass].open('w', 'FCELL', overwrite=True)
@@ -493,11 +512,14 @@
img_np_row = np.zeros((rowincr, current.cols, n_features))
mask_np_row = np.zeros((rowincr, current.cols))
- # loop through each row, and each band and add these values to the 2D array img_np_row
        + # loop through each row, and each band
        + # and add these values to the 2D array img_np_row
for row in range(rowblock, rowblock+rowincr, 1):
mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
for band in range(n_features):
- img_np_row[row-rowblock, :, band] = np.array(rasstack[band][row])
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
mask_np_row[mask_np_row == -2147483648] = np.nan
nanmask = np.isnan(mask_np_row) # True in the mask means invalid data
@@ -511,10 +533,10 @@
result = clf.predict(flat_pixels)
result = result.reshape((rowincr, current.cols))
- # replace NaN values so that the prediction surface does not have a border
+ # replace NaN values so that the prediction does not have a border
result = np.ma.masked_array(result, mask=nanmask, fill_value=np.nan)
- # return a copy of result, with masked values filled with a given value
+ # return a copy of result, with masked values filled with a value
result = result.filled([nodata])
# for each row we can perform computation, and write the result into
@@ -526,13 +548,20 @@
# same for probabilities
if class_probabilities == True and mode == 'classification':
result_proba = clf.predict_proba(flat_pixels)
+
for iclass in range(nclasses):
result_proba_class = result_proba[:, iclass]
- result_proba_class = result_proba_class.reshape((rowincr, current.cols))
- result_proba_class = np.ma.masked_array(result_proba_class, mask=nanmask, fill_value=np.nan)
+ result_proba_class = \
+ result_proba_class.reshape((rowincr, current.cols))
+
+ result_proba_class = \
+ np.ma.masked_array(result_proba_class,
+ mask=nanmask, fill_value=np.nan)
result_proba_class = result_proba_class.filled([np.nan])
+
for row in range(rowincr):
- newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
+ newrow = Buffer((result_proba_class.shape[1],),
+ mtype='FCELL')
newrow[:] = result_proba_class[row, :]
prob[iclass].put_row(newrow)
@@ -561,7 +590,7 @@
def cross_val_classification(clf, X, y, cv, rstate):
# custom function to calculate classification metrics
- # eliminated need to calculate each metric separately using cross_val_score function
+ # eliminates need to calculate each metric using cross_val_score
# returns: a 1D list of accuracy, kappa and auc scores
# returns: mean precision/recall and std precision/recall per class
@@ -578,7 +607,9 @@
}
# generate Kfold indices
- k_fold = cross_validation.StratifiedKFold(y, n_folds=cv, shuffle=False, random_state=rstate)
+ k_fold = cross_validation.StratifiedKFold(y, n_folds=cv,
+ shuffle=False,
+ random_state=rstate)
# dictionary of lists to store metrics
cmstats = {
@@ -602,22 +633,54 @@
y_pred = fit.predict(X_test)
# calculate metrics
- cmstats['accuracy'] = np.append(cmstats['accuracy'], metrics.accuracy_score(y_test, y_pred))
- cmstats['precision'] = np.append(cmstats['precision'], metrics.precision_score(y_test, y_pred, average='weighted'))
- cmstats['recall'] = np.append(cmstats['recall'], metrics.recall_score(y_test, y_pred, average='weighted'))
- cmstats['f1'] = np.append(cmstats['f1'], metrics.f1_score(y_test, y_pred, average='weighted'))
- cmstats['kappa'] = np.append(cmstats['kappa'], metrics.cohen_kappa_score(y_test, y_pred))
+ cmstats['accuracy'] = np.append(cmstats['accuracy'],
+ metrics.accuracy_score(y_test, y_pred))
+ cmstats['precision'] = np.append(cmstats['precision'],
+ metrics.precision_score(y_test,
+ y_pred,
+ average='weighted'))
+
+
+ cmstats['recall'] = np.append(cmstats['recall'],
+ metrics.recall_score(y_test, y_pred,
+ average='weighted'))
+
+ cmstats['f1'] = np.append(cmstats['f1'],
+ metrics.f1_score(y_test, y_pred,
+ average='weighted'))
+
+ cmstats['kappa'] = np.append(cmstats['kappa'],
+ metrics.cohen_kappa_score(y_test, y_pred))
+
# test for if binary and classes equal 0,1
if len(np.unique(y)) == 2 and all ([0,1] == np.unique(y)):
+
+ stat_method = "binary"
+
y_pred_proba = fit.predict_proba(X_test)[:,1]
- cmstats['auc'] = np.append(cmstats['auc'], metrics.roc_auc_score(y_test, y_pred_proba))
+ cmstats['auc'] = np.append(cmstats['auc'],
+ metrics.roc_auc_score(y_test,
+ y_pred_proba))
+ else:
+ stat_method = "micro"
# Get performance measures by class
for cindex in range(nclasses):
- byclass_metrics['recall'][cindex, fold] = metrics.recall_score(y_test, y_pred, labels = [class_list[cindex]], pos_label=class_list[cindex])
- byclass_metrics['precision'][cindex, fold] = metrics.precision_score(y_test, y_pred, labels = [class_list[cindex]], pos_label=class_list[cindex])
- byclass_metrics['f1'][cindex, fold] = metrics.f1_score(y_test, y_pred, labels = [class_list[cindex]], pos_label=class_list[cindex])
+ byclass_metrics['recall'][cindex, fold] = \
+ metrics.recall_score(y_test, y_pred, labels =
+ [class_list[cindex]],
+ pos_label=class_list[cindex], average = stat_method)
+
+ byclass_metrics['precision'][cindex, fold] = \
+ metrics.precision_score(y_test, y_pred, labels =
+ [class_list[cindex]],
+ pos_label=class_list[cindex], average = stat_method)
+
+ byclass_metrics['f1'][cindex, fold] = \
+ metrics.f1_score(y_test, y_pred, labels =
+ [class_list[cindex]],
+ pos_label=class_list[cindex], average = stat_method)
fold+=1
return(cmstats, byclass_metrics)
@@ -657,6 +720,7 @@
importances = flags['f']
weighting = flags['b']
lowmem = flags['l']
+ ncores = int(options['ncores'])
# logistic regression
c_lr = float(options['c_lr'])
@@ -692,7 +756,6 @@
or model == 'GaussianNB' \
or model == 'LinearDiscriminantAnalysis' \
or model == 'QuadraticDiscriminantAnalysis':
-
mode = 'classification'
else:
mode = 'regression'
@@ -755,7 +818,9 @@
"""
# fetch individual raster names from group
- groupmaps = im.group(group=igroup, flags="g", quiet = True, stdout_=PIPE).outputs.stdout
+ groupmaps = im.group(group=igroup, flags="g",
+ quiet = True, stdout_=PIPE).outputs.stdout
+
maplist = groupmaps.split(os.linesep)
maplist = maplist[0:len(maplist)-1]
n_features = len(maplist)
@@ -825,7 +890,7 @@
from sklearn.ensemble import GradientBoostingRegressor
classifiers = {
- 'LogisticRegression': LogisticRegression(C=c_lr, fit_intercept=fi, n_jobs=-1, class_weight=weighting),
+ 'LogisticRegression': LogisticRegression(C=c_lr, fit_intercept=fi, n_jobs=ncores, class_weight=weighting),
'DecisionTreeClassifier': DecisionTreeClassifier(splitter=splitter_dt,
max_features=m_features_dt,
min_samples_split=min_samples_split_dt,
@@ -844,9 +909,9 @@
max_features=m_features_rf,
min_samples_split=minsplit_rf,
random_state=randst,
- n_jobs=-1,
+ n_jobs=ncores,
class_weight=weighting),
- 'RandomForestRegressor': RandomForestRegressor(n_jobs=-1,
+ 'RandomForestRegressor': RandomForestRegressor(n_jobs=ncores,
n_estimators=ntrees_rf,
oob_score=False,
max_features=m_features_rf,
@@ -898,7 +963,8 @@
# If cv > 1 then use cross-validation to generate performance measures
if cv > 1:
- from sklearn.cross_validation import cross_val_predict
+ from sklearn.cross_validation import cross_val_predict, cross_val_score
+ from sklearn.metrics import classification_report
grass.message(_('\r\n'))
grass.message(_("Cross validation global performance measures......:"))
@@ -938,10 +1004,9 @@
grass.message(_(row))
else:
- r2 = cross_val_score(clf, X, y, cv=cv, scoring='r2', n_jobs=-1)
+ r2 = cross_val_score(clf, X, y, cv=cv, scoring='r2', n_jobs=ncores)
grass.message(_("R2:\t%0.2f\t+/-\t%0.2f" % (r2.mean(), r2.std() * 2)))
-
# diagnostics
if importances == True:
if (model == 'RandomForestClassifier' or
@@ -957,7 +1022,8 @@
grass.message(_("id" + "\t" + "Raster" + "\t" + "Importance"))
for i in range(len(clfimp)):
- grass.message(_(str(i) + "\t" + maplist[i] + "\t" + str(round(clfimp[i], 4))))
+ grass.message(_(str(i) + "\t" + maplist[i]
+ + "\t" + str(round(clfimp[i], 4))))
# save the model
if model_save != '':
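
The scikit-learn 0.17/0.18 difference mentioned in the log is handled by the stat_method switch added to cross_val_classification above: the per-class precision/recall/f1 scores are now requested with an explicit average mode ('binary' when the labels are exactly 0/1, 'micro' otherwise). A rough standalone sketch of that pattern, using made-up test data, is:

    # sketch only: toy data illustrating the average= handling added above
    import numpy as np
    from sklearn import metrics

    y_test = np.array([0, 1, 1, 0, 1])
    y_pred = np.array([0, 1, 0, 0, 1])

    # choose the averaging mode the same way the module does
    if len(np.unique(y_test)) == 2 and all([0, 1] == np.unique(y_test)):
        stat_method = 'binary'
    else:
        stat_method = 'micro'

    # recall for class 1 only, with the averaging mode passed explicitly
    recall_class1 = metrics.recall_score(y_test, y_pred, labels=[1],
                                         pos_label=1, average=stat_method)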