[GRASS-SVN] r68202 - grass-addons/grass7/raster/r.randomforest
svn_grass at osgeo.org
svn_grass at osgeo.org
Fri Apr 1 09:23:32 PDT 2016
Author: spawley
Date: 2016-04-01 09:23:32 -0700 (Fri, 01 Apr 2016)
New Revision: 68202
Modified:
grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
minor update to r.randomforest
Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-04-01 13:47:24 UTC (rev 68201)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-04-01 16:23:32 UTC (rev 68202)
@@ -91,7 +91,7 @@
#% key: lines
#% type: integer
#% description: Processing block size in terms of number of rows
-#% answer: 100
+#% answer: 20
#% required: yes
#% guisection: Optional
#%end
@@ -108,6 +108,12 @@
#% guisection: Random Forest Options
#%end
+#%flag
+#% key: m
+#% description: Build model only - do not perform prediction
+#% guisection: Random Forest Options
+#%end
+
#%option G_OPT_F_OUTPUT
#% key: savefile
#% label: Save model from file
@@ -122,7 +128,15 @@
#% guisection: Optional
#%end
+#%option G_OPT_F_OUTPUT
+#% key: fimportance
+#% label: Save feature importance and accuracy to csv
+#% required: no
+#% guisection: Optional
+#%end
+
# standard modules
+from __future__ import print_function
import atexit, os, random, string, imp
from grass.pygrass.raster import RasterRow
from grass.pygrass.gis.region import Region
@@ -154,12 +168,11 @@
for i in range(nbands): rasstack[i].close()
mask_raster.close()
roi_raster.close()
- grass.run_command("g.remove", name = rfmask, flags = "f", type = "raster")
+ grass.run_command("g.remove", name=rfmask, flags="f", type="raster")
def normalize_newlines(string):
- # Windows and nix platforms have different line endings. DOS uses carriage return and
- # line feed ("\r\n") as a line ending, which Unix uses just line feed ("\n").
- # This script normalizes the line endings to unix format "\n"
+ # Windows uses carriage return and line feed ("\r\n") as a line ending
+ # Unix uses just line feed ("\n"). This function normalizes strings to "\n"
import re
return re.sub(r'(\r\n|\r|\n)', '\n', string)
@@ -170,6 +183,7 @@
mode = options['mode']
ntrees = options['ntrees']
balanced = flags['b']
+ modelonly = flags['m']
class_probabilities = flags['p']
rowincr = int(options['lines'])
mfeatures = int(options['mfeatures'])
@@ -177,12 +191,13 @@
randst = int(options['randst'])
model_save = options['savefile']
model_load = options['loadfile']
+ fimportance = options['fimportance']
- ##################### error checking for valid input parameters ########################################
+ ##################### error checking for valid input parameters ################################
if mfeatures == -1:
mfeatures = str('auto')
if mfeatures == 0:
- print("mfeatures must be greater than zero, or -1 which uses the sqrt(nfeatures).....exiting")
+ print("mfeatures must be greater than zero, or -1 which uses the sqrt(nfeatures)...exiting")
exit()
if minsplit == 0:
print("minsplit must be greater than zero.....exiting")
@@ -201,13 +216,13 @@
print("Cannot save and load a model at the same time")
exit()
- ###################### Fetch individual raster names from group ###################################
- groupmaps = grass.read_command("i.group", group = igroup, flags = "g")
+ ###################### Fetch individual raster names from group ###############################
+ groupmaps = grass.read_command("i.group", group=igroup, flags="g")
groupmaps = normalize_newlines(groupmaps)
maplist = groupmaps.split('\n')
maplist = maplist[0:len(maplist)-1]
- ######################### Obtain information about GRASS rasters to be classified ##################
+ ######################### Obtain information about GRASS rasters to be classified ##############
# Determine number of bands and then create a list of GRASS rasterrow objects
global nbands
@@ -231,14 +246,18 @@
# and not load all of the rasters into memory in a numpy array
current = Region()
- ########################### Create a imagery mask ###############################################
- # The input rasters might have slightly different dimensions in terms of value and non-value pixels.
- # We will use the GRASS r.series module to automatically create a mask by propagating the null values
+ ########################### Create a imagery mask ##############################################
+ # The input rasters might have different dimensions in terms of value and non-value pixels.
+ # We will use the r.series module to automatically create a mask by propagating the null values
global rfmask
- rfmask = 'tmp_' + ''.join([random.choice(string.ascii_letters + string.digits) for n in xrange(8)])
- grass.run_command("r.series", output = rfmask, input = maplist, method = 'count', flags = 'n', overwrite=True)
+ rfmask = 'tmp_' + ''.join([random.choice(string.ascii_letters + string.digits) \
+ for n in xrange(8)])
+
+ grass.run_command("r.series", output=rfmask, input=maplist, method='count', flags='n', \
+ overwrite=True)
+
global mask_raster
mask_raster = RasterRow(rfmask)
mask_raster.open('r')
@@ -252,46 +271,46 @@
else:
print("ROI raster does not exist.... exiting")
exit()
-
+
# load the model
if model_load != '':
rf = joblib.load(model_load)
else:
# determine cell storage type of training roi raster
- roi_type = grass.read_command("r.info", map = roi, flags = 'g')
+ roi_type = grass.read_command("r.info", map=roi, flags='g')
roi_type = normalize_newlines(str(roi_type))
roi_list = roi_type.split('\n')
dtype = roi_list[9].split('=')[1]
-
+
# check if training rois are valid for classification and regression
if mode == 'classification' and dtype != 'CELL':
print ("Classification mode requires an integer CELL type training roi map.....exiting")
exit()
-
+
# Count number of labelled pixels
- roi_stats = str(grass.read_command("r.univar", flags=("g"), map = roi))
+ roi_stats = str(grass.read_command("r.univar", flags=("g"), map=roi))
if os.name == "nt":
- roi_stats = roi_stats[0:len(roi_stats)-2] # to remove the last line ending and return characters
+ roi_stats = roi_stats[0:len(roi_stats)-2]
roi_stats = roi_stats.split('\r\n')[0]
else:
- roi_stats = roi_stats[0:len(roi_stats)-1] # to remove the last line ending and return characters
+ roi_stats = roi_stats[0:len(roi_stats)-1]
roi_stats = roi_stats.split('\n')[0]
-
+
ncells = str(roi_stats).split('=')[1]
nlabel_pixels = int(ncells)
-
- # Create a numpy array filled with zeros, with the dimensions of the number of columns in the region
+
+ # Create a zero numpy array with the dimensions of the number of columns in the region
# and the number of bands plus an additional band to attach the labels
- tindex=0
+ tindex = 0
training_labels = []
training_data = np.zeros((nlabel_pixels, nbands+1))
training_data[:] = np.NAN
-
+
# Loop through each row of the raster, get the pixels from that row in the ROI raster
# and check if any of those pixels are labelled (i.e. they are not nan).
- # If labelled pixels are encountered, loop through each imagery band for that row and put the data into
- # the img_row_band_np array. Also attach the label values onto the last column.
-
+ # If labelled pixels are encountered, loop through each band for that row and put the data
+ # into the img_row_band_np array. Also attach the label values onto the last column
+
for row in range(current.rows):
roi_row_np = roi_raster[row]
is_train = np.nonzero(roi_row_np > -2147483648)
@@ -302,21 +321,17 @@
imagerow_np = rasstack[band][row]
training_data[tindex : tindex+nlabels_in_row, band] = imagerow_np[is_train]
tindex = tindex + nlabels_in_row
-
+
# Determine the number of class labels using np.unique
nclasses = len(np.unique(training_labels))
-
+
# attach training label values onto last dimension of numpy array
training_data[0:nlabel_pixels, nbands] = training_labels
-
- # Remove nan rows from numpy array. Explanation: np.isnan(a) returns a similar array with True
- # where NaN, False elsewhere. .any(axis=1) reduces an m*n array to n with an logical or operation
- # on the whole rows, ~ inverts True/False and a[ ] chooses just the rows from the original array,
- # which have True within the brackets.
-
+
+ # Remove nan rows from numpy array
training_data = training_data[~np.isnan(training_data).any(axis=1)]
-
- # Split the numpy array into training_labels and training_data arrays to pass to the classifier
+
+ # Split the numpy array into training_labels and training_data arrays
training_labels = training_data[:, nbands]
training_data = training_data[:, 0:nbands]
@@ -325,43 +340,59 @@
if mode == 'classification':
if balanced == True:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=int(ntrees), oob_score=True, \
- class_weight = 'balanced', max_features = mfeatures, min_samples_split = minsplit, random_state = randst)
+ class_weight='balanced', max_features=mfeatures, min_samples_split=minsplit, \
+ random_state=randst)
else:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=int(ntrees), oob_score=True, \
- max_features = mfeatures, min_samples_split = minsplit, random_state = randst)
+ max_features=mfeatures, min_samples_split=minsplit, random_state=randst)
rf = rf.fit(training_data, training_labels)
+ acc = float(rf.oob_score_)
print('Our OOB prediction of accuracy is: {oob}%'.format(oob=rf.oob_score_ * 100))
else:
rf = RandomForestRegressor(n_jobs=-1, n_estimators=int(ntrees), oob_score=True, \
- max_features = mfeatures, min_samples_split = minsplit, random_state = randst)
+ max_features=mfeatures, min_samples_split=minsplit, random_state=randst)
rf = rf.fit(training_data, training_labels)
+ acc = rf.score(X=training_data, y=training_labels)
print('Our coefficient of determination R^2 of the prediction is: {r2}%'.format \
- (r2=rf.score(X = training_data, y=training_labels)))
-
+ (r2=rf.score(X=training_data, y=training_labels)))
+
# diagnostics
rfimp = pd.DataFrame(rf.feature_importances_)
- rfimp.insert(loc=0, column='Raster', value = maplist)
+ rfimp.insert(loc=0, column='Raster', value=maplist)
rfimp.columns = ['Raster', 'Importance']
print(rfimp)
-
+
+ # save diagnostics to file
+ if fimportance != '':
+ rfimp.to_csv(path_or_buf = fimportance)
+ fd = open(fimportance, 'a')
+ fd.write(str(rfimp.shape[0]) + ',' + 'Performance measure (OOB or R2)' + ',' + str(acc))
+ fd.close()
+
# save the model
if model_save != '':
joblib.dump(rf, model_save + ".pkl")
+
+ if modelonly == True:
+ exit()
################################ Prediction on the rest of the raster stack ###################
- # Create a np.array that can store each raster row for all of the bands, i.e. it is a long as the current columns,
- # and as wide as the number of bands
- # Loop through the raster, row-by-row and get the row values for each band, adding these to the img_np_row np.array,
- # which adds rowincr rows together to pass to the classifier. Otherwise, row-by-row is too inefficient.
+ # 1. Create a np.array that can store each raster row for all of the bands
+ # 2. Loop through the raster, row-by-row and get the row values for each band
+ # adding these to the img_np_row np.array,
+ # which bundles rowincr rows together to pass to the classifier,
+ # otherwise, row-by-row is too inefficient.
+ # 3. The scikit learn predict function expects a list of pixels, not an NxM matrix.
+ # We therefore need to reshape each row matrix into a list.
+ # The total matrix size = cols * nbands.
+ # Therefore we can use the np.reshape function to convert the image into a list
+ # with the number of rows = n_samples, and the number of columns = the number of bands.
+ # 4. Then we remove any NaN values because the scikit-learn predict function cannot handle NaNs.
+ # Here we replace them with a small value using the np.nan_to_num function.
+ # 5. The flat_pixels is then passed onto the prediction function.
+ # 6. After the prediction is performed on the row, to save keeping
+ # anything in memory, we save it to a GRASS raster object, row-by-row.
- # The scikit learn predict function expects a list of pixels, not an NxM matrix. We therefore need to reshape each row
- # matrix into a list. The total matrix size = cols * nbands. Therefore we can use the np.reshape function to convert
- # the image into a list with the number of rows equal to n_samples, and the number of columns equal to the number of bands.
- # Then we remove any NaN values because the scikit-learn predict function cannot handle NaNs. Here we replace them with a
- # small value using the np.nan_to_num function.
- # The flat_pixels is then passed onto the prediction function. After the prediction is performed on the row, to save keeping
- # anything in memory, we save it to a GRASS raster object, row-by-row.
-
classification = RasterRow(output)
if mode == 'classification':
ftype = 'CELL'
@@ -369,7 +400,7 @@
else:
ftype = 'FCELL'
nodata = np.nan
- classification.open('w', ftype, overwrite = True)
+ classification.open('w', ftype, overwrite=True)
# create and open RasterRow objects for classification and probabilities if enabled
if class_probabilities == True and mode == 'classification':
@@ -378,7 +409,7 @@
for iclass in range(nclasses):
prob_out_raster[iclass] = output + '_p' + str(iclass)
prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite = True)
+ prob[iclass].open('w', 'FCELL', overwrite=True)
for rowblock in range(0, current.rows, rowincr):
# check that the row increment does not exceed the number of rows
@@ -406,8 +437,10 @@
# replace NaN values so that the prediction surface does not have a border
result_NaN = np.ma.masked_array(result, mask=nanmask, fill_value=np.nan)
- result_masked = result_NaN.filled([nodata]) #Return a copy of result, with masked values filled with a given value
+ # return a copy of result, with masked values filled with a given value
+ result_masked = result_NaN.filled([nodata])
+
# for each row we can perform computation, and write the result into
for row in range(rowincr):
newrow = Buffer((result_masked.shape[1],), mtype=ftype)
@@ -421,7 +454,7 @@
result_proba_class = result_proba[:, iclass]
result_proba_class = result_proba_class.reshape((rowincr, current.cols))
result_proba_class_NaN = np.ma.masked_array(result_proba_class, mask=nanmask, fill_value=np.nan)
- result_proba_class_masked = result_proba_class_NaN.filled([np.nan]) #Return a copy of result, with masked values filled with a given value
+ result_proba_class_masked = result_proba_class_NaN.filled([np.nan])
for row in range(rowincr):
newrow = Buffer((result_proba_class_masked.shape[1],), mtype='FCELL')
newrow[:] = result_proba_class_masked[row, :]
More information about the grass-commit
mailing list