[GRASS-SVN] r68202 - grass-addons/grass7/raster/r.randomforest

Fri Apr 1 09:23:32 PDT 2016

Author: spawley
Date: 2016-04-01 09:23:32 -0700 (Fri, 01 Apr 2016)
New Revision: 68202

Modified:
   grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
minor update to r.randomforest

Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================

--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-04-01 13:47:24 UTC (rev 68201)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-04-01 16:23:32 UTC (rev 68202)
@@ -91,7 +91,7 @@
 #% key: lines
 #% type: integer
 #% description: Processing block size in terms of number of rows
-#% answer: 100
+#% answer: 20
 #% required: yes
 #% guisection: Optional
 #%end
@@ -108,6 +108,12 @@
 #% guisection: Random Forest Options
 #%end
 
+#%flag
+#% key: m
+#% description: Build model only - do not perform prediction
+#% guisection: Random Forest Options
+#%end
+
 #%option G_OPT_F_OUTPUT
 #% key: savefile
 #% label: Save model from file
@@ -122,7 +128,15 @@
 #% guisection: Optional
 #%end
 
+#%option G_OPT_F_OUTPUT
+#% key: fimportance
+#% label: Save feature importance and accuracy to csv
+#% required: no
+#% guisection: Optional
+#%end
+
 # standard modules
+from __future__ import print_function
 import atexit, os, random, string, imp
 from grass.pygrass.raster import RasterRow
 from grass.pygrass.gis.region import Region
@@ -154,12 +168,11 @@
     for i in range(nbands): rasstack[i].close()
     mask_raster.close()
     roi_raster.close()
-    grass.run_command("g.remove", name = rfmask, flags = "f", type = "raster")
+    grass.run_command("g.remove", name=rfmask, flags="f", type="raster")
 
 def normalize_newlines(string):
-    # Windows and nix platforms have different line endings. DOS uses carriage return and
-    # line feed ("\r\n") as a line ending, which Unix uses just line feed ("\n").
-    # This script normalizes the line endings to unix format "\n"
+    # Windows uses carriage return and line feed ("\r\n") as a line ending;
+    # Unix uses just line feed ("\n"). This function normalizes "\r\n", "\r" and "\n" to "\n"
     import re
     return re.sub(r'(\r\n|\r|\n)', '\n', string)
 
@@ -170,6 +183,7 @@
     mode = options['mode']
     ntrees = options['ntrees']
     balanced = flags['b']
+    modelonly = flags['m']
     class_probabilities = flags['p']
     rowincr = int(options['lines'])
     mfeatures = int(options['mfeatures'])
@@ -177,12 +191,13 @@
     randst = int(options['randst'])
     model_save = options['savefile']
     model_load = options['loadfile']
+    fimportance = options['fimportance']
 
-    ##################### error checking for valid input parameters ########################################
+    ##################### error checking for valid input parameters ################################
     if mfeatures == -1:
         mfeatures = str('auto')
     if mfeatures == 0:
-        print("mfeatures must be greater than zero, or -1 which uses the sqrt(nfeatures).....exiting")
+        print("mfeatures must be greater than zero, or -1 which uses the sqrt(nfeatures)...exiting")
         exit()
     if minsplit == 0:
         print("minsplit must be greater than zero.....exiting")
@@ -201,13 +216,13 @@
         print("Cannot save and load a model at the same time")
         exit()
 
-    ######################  Fetch individual raster names from group ###################################
-    groupmaps = grass.read_command("i.group", group = igroup, flags = "g")
+    ######################  Fetch individual raster names from group ###############################
+    groupmaps = grass.read_command("i.group", group=igroup, flags="g")
     groupmaps = normalize_newlines(groupmaps)
     maplist = groupmaps.split('\n')
     maplist = maplist[0:len(maplist)-1]
 
-    ######################### Obtain information about GRASS rasters to be classified ##################
+    ######################### Obtain information about GRASS rasters to be classified ##############
 
     # Determine number of bands and then create a list of GRASS rasterrow objects
     global nbands
@@ -231,14 +246,18 @@
     # and not load all of the rasters into memory in a numpy array
     current = Region()
 
-    ########################### Create a imagery mask ###############################################
-    # The input rasters might have slightly different dimensions in terms of value and non-value pixels.
-    # We will use the GRASS r.series module to automatically create a mask by propagating the null values
+    ########################### Create an imagery mask #############################################
+    # The input rasters might have different dimensions in terms of value and non-value pixels.
+    # We will use the r.series module to automatically create a mask by propagating the null values
 
     global rfmask
-    rfmask = 'tmp_' + ''.join([random.choice(string.ascii_letters + string.digits) for n in xrange(8)])
-    grass.run_command("r.series", output = rfmask, input = maplist, method = 'count', flags = 'n', overwrite=True)
 
+    rfmask = 'tmp_' + ''.join([random.choice(string.ascii_letters + string.digits) \
+    for n in xrange(8)])
+
+    grass.run_command("r.series", output=rfmask, input=maplist, method='count', flags='n', \
+    overwrite=True)
+
     global mask_raster
     mask_raster = RasterRow(rfmask)
     mask_raster.open('r')
@@ -252,46 +271,46 @@
     else:
         print("ROI raster does not exist.... exiting")
         exit()
-    
+
     # load the model
     if model_load != '':
         rf = joblib.load(model_load)
     else:
         # determine cell storage type of training roi raster
-        roi_type = grass.read_command("r.info", map = roi, flags = 'g')
+        roi_type = grass.read_command("r.info", map=roi, flags='g')
         roi_type = normalize_newlines(str(roi_type))
         roi_list = roi_type.split('\n')
         dtype = roi_list[9].split('=')[1]
-    
+
         # check if training rois are valid for classification and regression
         if mode == 'classification' and dtype != 'CELL':
             print ("Classification mode requires an integer CELL type training roi map.....exiting")
             exit()
-    
+
         # Count number of labelled pixels
-        roi_stats = str(grass.read_command("r.univar", flags=("g"), map = roi))
+        roi_stats = str(grass.read_command("r.univar", flags=("g"), map=roi))
         if os.name == "nt":
-            roi_stats = roi_stats[0:len(roi_stats)-2] # to remove the last line ending and return characters
+            roi_stats = roi_stats[0:len(roi_stats)-2]
             roi_stats = roi_stats.split('\r\n')[0]
         else:
-            roi_stats = roi_stats[0:len(roi_stats)-1] # to remove the last line ending and return characters
+            roi_stats = roi_stats[0:len(roi_stats)-1]
             roi_stats = roi_stats.split('\n')[0]
-    
+
         ncells = str(roi_stats).split('=')[1]
         nlabel_pixels = int(ncells)
-    
-        # Create a numpy array filled with zeros, with the dimensions of the number of columns in the region
+
+        # Create a zero numpy array with the dimensions of the number of columns in the region
         # and the number of bands plus an additional band to attach the labels
-        tindex=0
+        tindex = 0
         training_labels = []
         training_data = np.zeros((nlabel_pixels, nbands+1))
         training_data[:] = np.NAN
-    
+
         # Loop through each row of the raster, get the pixels from that row in the ROI raster
         # and check if any of those pixels are labelled (i.e. they are not nan).
-        # If labelled pixels are encountered, loop through each imagery band for that row and put the data into
-        # the img_row_band_np array. Also attach the label values onto the last column.
-    
+        # If labelled pixels are encountered, loop through each band for that row and put the data
+        # into the img_row_band_np array. Also attach the label values onto the last column
+
         for row in range(current.rows):
             roi_row_np = roi_raster[row]
             is_train = np.nonzero(roi_row_np > -2147483648)
@@ -302,21 +321,17 @@
                     imagerow_np = rasstack[band][row]
                     training_data[tindex : tindex+nlabels_in_row, band] = imagerow_np[is_train]
                 tindex = tindex + nlabels_in_row
-    
+
         # Determine the number of class labels using np.unique
         nclasses = len(np.unique(training_labels))
-    
+
         # attach training label values onto last dimension of numpy array
         training_data[0:nlabel_pixels, nbands] = training_labels
-    
-        # Remove nan rows from numpy array. Explanation: np.isnan(a) returns a similar array with True
-        # where NaN, False elsewhere. .any(axis=1) reduces an m*n array to n with an logical or operation
-        # on the whole rows, ~ inverts True/False and a[  ] chooses just the rows from the original array,
-        # which have True within the brackets.
-    
+
+        # Remove nan rows from numpy array
         training_data = training_data[~np.isnan(training_data).any(axis=1)]
-    
-        # Split the numpy array into training_labels and training_data arrays to pass to the classifier
+
+        # Split the numpy array into training_labels and training_data arrays
         training_labels = training_data[:, nbands]
         training_data = training_data[:, 0:nbands]
 
@@ -325,43 +340,59 @@
         if mode == 'classification':
             if balanced == True:
                 rf = RandomForestClassifier(n_jobs=-1, n_estimators=int(ntrees), oob_score=True, \
-                class_weight = 'balanced', max_features = mfeatures, min_samples_split = minsplit, random_state = randst)
+                class_weight='balanced', max_features=mfeatures, min_samples_split=minsplit, \
+                random_state=randst)
             else:
                 rf = RandomForestClassifier(n_jobs=-1, n_estimators=int(ntrees), oob_score=True, \
-                max_features = mfeatures, min_samples_split = minsplit, random_state = randst)
+                max_features=mfeatures, min_samples_split=minsplit, random_state=randst)
             rf = rf.fit(training_data, training_labels)
+            acc = float(rf.oob_score_)
             print('Our OOB prediction of accuracy is: {oob}%'.format(oob=rf.oob_score_ * 100))
         else:
             rf = RandomForestRegressor(n_jobs=-1, n_estimators=int(ntrees), oob_score=True, \
-            max_features = mfeatures, min_samples_split = minsplit, random_state = randst)
+            max_features=mfeatures, min_samples_split=minsplit, random_state=randst)
             rf = rf.fit(training_data, training_labels)
+            acc = rf.score(X=training_data, y=training_labels)
             print('Our coefficient of determination R^2 of the prediction is: {r2}%'.format \
-            (r2=rf.score(X = training_data, y=training_labels)))
-    
+            (r2=rf.score(X=training_data, y=training_labels)))
+
         # diagnostics
         rfimp = pd.DataFrame(rf.feature_importances_)
-        rfimp.insert(loc=0, column='Raster', value = maplist)
+        rfimp.insert(loc=0, column='Raster', value=maplist)
         rfimp.columns = ['Raster', 'Importance']
         print(rfimp)
-
+        
+        # save diagnostics to file        
+        if fimportance != '':
+            rfimp.to_csv(path_or_buf = fimportance)
+            fd = open(fimportance, 'a')
+            fd.write(str(rfimp.shape[0]) + ',' + 'Performance measure (OOB or R2)' + ',' + str(acc))
+            fd.close()
+        
         # save the model
         if model_save != '':
             joblib.dump(rf, model_save + ".pkl")
+        
+        if modelonly == True:
+            exit()
 
     ################################ Prediction on the rest of the raster stack ###################
-    # Create a np.array that can store each raster row for all of the bands, i.e. it is a long as the current columns,
-    # and as wide as the number of bands
-    # Loop through the raster, row-by-row and get the row values for each band, adding these to the img_np_row np.array,
-    # which adds rowincr rows together to pass to the classifier. Otherwise, row-by-row is too inefficient.
+    # 1. Create a np.array that can store each raster row for all of the bands
+    # 2. Loop through the raster, row-by-row and get the row values for each band
+    #    adding these to the img_np_row np.array,
+    #    which bundles rowincr rows together to pass to the classifier,
+    #    otherwise, row-by-row is too inefficient.
+    # 3. The scikit learn predict function expects a list of pixels, not an NxM matrix.
+    #    We therefore need to reshape each row matrix into a list.
+    #    The total matrix size = cols * nbands.
+    #    Therefore we can use the np.reshape function to convert the image into a list
+    #    with the number of rows = n_samples, and the number of columns = the number of bands.
+    # 4. Then we remove any NaN values because the scikit-learn predict function cannot handle NaNs.
+    #    Here we replace them with a small value using the np.nan_to_num function.
+    # 5. The flat_pixels is then passed onto the prediction function.
+    # 6. After the prediction is performed on the row, to avoid keeping
+    #    anything in memory, we save it to a GRASS raster object, row-by-row.
 
-    # The scikit learn predict function expects a list of pixels, not an NxM matrix. We therefore need to reshape each row
-    # matrix into a list. The total matrix size = cols * nbands. Therefore we can use the np.reshape function to convert
-    # the image into a list with the number of rows equal to n_samples, and the number of columns equal to the number of bands.
-    # Then we remove any NaN values because the scikit-learn predict function cannot handle NaNs. Here we replace them with a
-    # small value using the np.nan_to_num function.
-    # The flat_pixels is then passed onto the prediction function. After the prediction is performed on the row, to save keeping
-    # anything in memory, we save it to a GRASS raster object, row-by-row.
-
     classification = RasterRow(output)
     if mode == 'classification':
         ftype = 'CELL'
@@ -369,7 +400,7 @@
     else:
         ftype = 'FCELL'
         nodata = np.nan
-    classification.open('w', ftype,  overwrite = True)
+    classification.open('w', ftype, overwrite=True)
 
     # create and open RasterRow objects for classification and probabilities if enabled
     if class_probabilities == True and mode == 'classification':
@@ -378,7 +409,7 @@
         for iclass in range(nclasses):
             prob_out_raster[iclass] = output + '_p' + str(iclass)
             prob[iclass] = RasterRow(prob_out_raster[iclass])
-            prob[iclass].open('w', 'FCELL',  overwrite = True)
+            prob[iclass].open('w', 'FCELL', overwrite=True)
 
     for rowblock in range(0, current.rows, rowincr):
         # check that the row increment does not exceed the number of rows
@@ -406,8 +437,10 @@
 
         # replace NaN values so that the prediction surface does not have a border
         result_NaN = np.ma.masked_array(result, mask=nanmask, fill_value=np.nan)
-        result_masked = result_NaN.filled([nodata]) #Return a copy of result, with masked values filled with a given value
 
+        # return a copy of result, with masked values filled with a given value
+        result_masked = result_NaN.filled([nodata])
+
         # for each row we can perform computation, and write the result into
         for row in range(rowincr):
             newrow = Buffer((result_masked.shape[1],), mtype=ftype)
@@ -421,7 +454,7 @@
                 result_proba_class = result_proba[:, iclass]
                 result_proba_class = result_proba_class.reshape((rowincr, current.cols))
                 result_proba_class_NaN = np.ma.masked_array(result_proba_class, mask=nanmask, fill_value=np.nan)
-                result_proba_class_masked = result_proba_class_NaN.filled([np.nan]) #Return a copy of result, with masked values filled with a given value
+                result_proba_class_masked = result_proba_class_NaN.filled([np.nan])
                 for row in range(rowincr):
                     newrow = Buffer((result_proba_class_masked.shape[1],), mtype='FCELL')
                     newrow[:] = result_proba_class_masked[row, :]