[GRASS-SVN] r71016 - grass-addons/grass7/raster/r.learn.ml

Wed May 3 13:52:06 PDT 2017

Author: spawley
Date: 2017-05-03 13:52:05 -0700 (Wed, 03 May 2017)
New Revision: 71016

Modified:
   grass-addons/grass7/raster/r.learn.ml/r.learn.ml.html
   grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
   grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
Log:
r.learn.ml updated raster prediction to use multiprocessing

Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.html
===================================================================

--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.html	2017-05-03 15:42:48 UTC (rev 71015)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.html	2017-05-03 20:52:05 UTC (rev 71016)
@@ -78,7 +78,7 @@
 <p>Then we can use these training pixels to perform a classification on the more recently obtained landsat 7 image:</p>
 <div class="code"><pre>
 r.learn.ml group=lsat7_2000 trainingmap=landclass96_roi output=rf_classification \
-  classifier=RandomForestClassifier n_estimators=500 max_features=-1 min_samples_split=2 randst=1 lines=25
+  classifier=RandomForestClassifier n_estimators=500 randst=1 lines=25
 
 # copy category labels from landclass training map to result
 r.category rf_classification raster=landclass96_roi

Modified: grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py	2017-05-03 15:42:48 UTC (rev 71015)
+++ grass-addons/grass7/raster/r.learn.ml/r.learn.ml.py	2017-05-03 20:52:05 UTC (rev 71016)
@@ -889,27 +889,27 @@
         if prob_only is False:
             gscript.message('Predicting classification/regression raster...')
             predict(estimator=clf, predictors=maplist, output=output,
-                    predict_type='raw', rowincr=rowincr)
+                    predict_type='raw', rowincr=rowincr, n_jobs=n_jobs)
 
             if predict_resamples is True:
                 for i in range(cv):
                     resample_name = output + '_Resample' + str(i)
                     predict(estimator=models[i], predictors=maplist,
                             output=resample_name, predict_type='raw',
-                            rowincr=rowincr)
+                            rowincr=rowincr, n_jobs=n_jobs)
 
         # predict class probabilities
         if probability is True:
             gscript.message('Predicting class probabilities...')
             predict(estimator=clf, predictors=maplist, output=output, predict_type='prob',
-                    index=indexes, rowincr=rowincr)
+                    index=indexes, rowincr=rowincr, n_jobs=n_jobs)
 
             if predict_resamples is True:
                 for i in range(cv):
                     resample_name = output + '_Resample' + str(i)
                     predict(estimator=models[i], predictors=maplist,
                             output=resample_name, predict_type='prob',
-                            index=indexes, rowincr=rowincr)
+                            index=indexes, rowincr=rowincr, n_jobs=n_jobs)
     else:
         gscript.message("Model built and now exiting")
 

Modified: grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py
===================================================================
--- grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py	2017-05-03 15:42:48 UTC (rev 71015)
+++ grass-addons/grass7/raster/r.learn.ml/rlearn_rasters.py	2017-05-03 20:52:05 UTC (rev 71016)
@@ -166,7 +166,7 @@
 
 
 def predict(estimator, predictors, output, predict_type='raw',
-            index=None, rowincr=25):
+            index=None, rowincr=25, n_jobs=-2):
     """
     Prediction on list of GRASS rasters using a fitted scikit learn model
 
@@ -179,8 +179,11 @@
                   'prob' for class probabilities
     index: Optional, list of class indices to export
     rowincr: Integer of raster rows to process at one time
+    n_jobs: Number of processing cores (joblib convention: -2 = all cores but one)
     """
 
+    from sklearn.externals.joblib import Parallel, delayed
+
     # convert potential single index to list
     if isinstance(index, int): index = [index]
 
@@ -198,92 +201,69 @@
                           " does not exist.... exiting")
 
     # -------------------------------------------------------------------------
-    # Prediction using blocks of rows per iteration
+    # parallel prediction
     # -------------------------------------------------------------------------
 
-    for rowblock in range(0, current.rows, rowincr):
-        gscript.percent(rowblock, current.rows, rowincr)
+    # create lists of row increments
+    row_mins, row_maxs = [], []
+    for row in range(0, current.rows, rowincr):
+        if row+rowincr > current.rows:
+            rowincr = current.rows - row
+        row_mins.append(row)
+        row_maxs.append(row+rowincr)
 
-        # check that the row increment does not exceed the number of rows
-        if rowblock+rowincr > current.rows:
-            rowincr = current.rows - rowblock
-        img_np_row = np.zeros((rowincr, current.cols, n_features))
+    # perform predictions on lists of row increments in parallel
+    predictions = Parallel(n_jobs=n_jobs)(
+        delayed(__predict_parallel)
+        (estimator, predictors, predict_type, current, row_min, row_max)
+        for row_min, row_max in zip(row_mins, row_maxs))
 
-        # loop through each row, and each band and add to 2D img_np_row
-        for row in range(rowblock, rowblock+rowincr, 1):
-            for band in range(n_features):
-                img_np_row[row-rowblock, :, band] = \
-                    np.array(rasstack[band][row])
+    # unpack the results
+    results = []
+    ftypes = []
+    for block in predictions:
+        results.append(block[0])
+        ftypes.append(block[1])
 
-        # create mask
-        img_np_row[img_np_row == -2147483648] = np.nan
-        mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
-        for feature in range(n_features):
-            invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
-            mask[invalid_indexes] = np.nan
+    # -------------------------------------------------------------------------
+    #  writing of predicted results for classification
+    # -------------------------------------------------------------------------
+    if predict_type == 'raw':
+        classification = RasterRow(output)
+        classification.open('w', ftypes[0], overwrite=True)
 
-        # reshape each row-band matrix into a n*m array
-        nsamples = rowincr * current.cols
-        flat_pixels = img_np_row.reshape((nsamples, n_features))
-
-        # remove NaNs prior to passing to scikit-learn predict
-        flat_pixels = np.nan_to_num(flat_pixels)
-
-        # perform prediction for classification/regression
-        if predict_type == 'raw':
-            result = estimator.predict(flat_pixels)
-            result = result.reshape((rowincr, current.cols))
-
-            # determine nodata value and grass raster type
-            if result.dtype == 'float':
-                nodata = np.nan
-                ftype = 'FCELL'
-            else:
-                nodata = -2147483648
-                ftype = 'CELL'
-
-            # replace NaN values so that the prediction does not have a border
-            result[np.nonzero(np.isnan(mask))] = nodata
-
-            # on first iteration create the RasterRow object
-            if rowblock == 0:
-                if predict_type == 'raw':
-                    classification = RasterRow(output)
-                    classification.open('w', ftype, overwrite=True)
-
-            # write the classification result
-            for row in range(rowincr):
-                newrow = Buffer((result.shape[1],), mtype=ftype)
-                newrow[:] = result[row, :]
+        # write the classification result
+        for result_block in results:
+            for row in range(result_block.shape[0]):
+                newrow = Buffer((result_block.shape[1],), mtype=ftypes[0])
+                newrow[:] = result_block[row, :]
                 classification.put_row(newrow)
 
-        # perform prediction for class probabilities
-        if predict_type == 'prob':
-            result_proba = estimator.predict_proba(flat_pixels)
+    # -------------------------------------------------------------------------
+    # writing of predicted results for probabilities
+    # -------------------------------------------------------------------------
+    if predict_type == 'prob':
+        # determine number of classes
+        if index is None:
+            index = range(results[0].shape[2])
+            n_classes = len(index)
+        else:
+            n_classes = len(np.unique(index))
 
-            # on first loop determine number of probability classes
-            # and open rasterrow objects for writing
-            if rowblock == 0:
-                if index is None:
-                    index = range(result_proba.shape[1])
-                    n_classes = len(index)
-                else:
-                    n_classes = len(np.unique(index))
+        # create and open RasterRow objects for probabilities
+        prob_out_raster = [0] * n_classes
+        prob = [0] * n_classes
+        for iclass, label in enumerate(index):
+            prob_out_raster[iclass] = output + '_classPr' + str(label)
+            prob[iclass] = RasterRow(prob_out_raster[iclass])
+            prob[iclass].open('w', 'FCELL', overwrite=True)
 
-                # create and open RasterRow objects for probabilities
-                prob_out_raster = [0] * n_classes
-                prob = [0] * n_classes
-                for iclass, label in enumerate(index):
-                    prob_out_raster[iclass] = output + '_classPr' + str(label)
-                    prob[iclass] = RasterRow(prob_out_raster[iclass])
-                    prob[iclass].open('w', 'FCELL', overwrite=True)
-
+        # write the class probability results
+        for results_proba_block in results:
             for iclass, label in enumerate(index):
-                result_proba_class = result_proba[:, label]
-                result_proba_class = result_proba_class.reshape((rowincr, current.cols))
-                result_proba_class[np.nonzero(np.isnan(mask))] = np.nan
+                result_proba_class = results_proba_block[:, :, label]
 
-                for row in range(rowincr):
+                for row in range(result_proba_class.shape[0]):
                     newrow = Buffer((result_proba_class.shape[1],), mtype='FCELL')
                     newrow[:] = result_proba_class[row, :]
                     prob[iclass].put_row(newrow)
@@ -291,11 +271,94 @@
     # -------------------------------------------------------------------------
     # close all maps
     # -------------------------------------------------------------------------
-    for i in range(n_features): rasstack[i].close()
     if predict_type == 'raw': classification.close()
     if predict_type == 'prob':
         try:
             for iclass in range(n_classes):
                 prob[iclass].close()
         except:
-            pass
\ No newline at end of file
+            pass
+
+
+def __predict_parallel(estimator, predictors, predict_type, current, row_min, row_max):
+    """
+    Performs prediction on range of rows in grass rasters
+
+    Args
+    ----
+    estimator: scikit-learn estimator object
+    predictors: list of GRASS rasters
+    predict_type: character, 'raw' for classification/regression;
+                  'prob' for class probabilities
+    current: current region settings
+    row_min, row_max: Range of rows of grass rasters to perform predictions
+
+    Returns
+    -------
+    result: 2D (classification) or 3D numpy array (class probabilities) of predictions
+    ftype: data storage type ('FCELL' or 'CELL' for 'raw'; None for 'prob')
+    """
+
+    # initialize output
+    result, ftype, mask = None, None, None
+
+    # open grass rasters
+    n_features = len(predictors)
+    rasstack = [0] * n_features
+
+    for i in range(n_features):
+        rasstack[i] = RasterRow(predictors[i])
+        if rasstack[i].exist() is True:
+            rasstack[i].open('r')
+        else:
+            gscript.fatal("GRASS raster " + predictors[i] +
+                          " does not exist.... exiting")
+
+    # loop through each row and each band, filling the 3D img_np_row (rows, cols, bands)
+    img_np_row = np.zeros((row_max-row_min, current.cols, n_features))
+    for row in range(row_min, row_max):
+        for band in range(n_features):
+            img_np_row[row-row_min, :, band] = np.array(rasstack[band][row])
+
+    # create mask
+    img_np_row[img_np_row == -2147483648] = np.nan
+    mask = np.zeros((img_np_row.shape[0], img_np_row.shape[1]))
+    for feature in range(n_features):
+        invalid_indexes = np.nonzero(np.isnan(img_np_row[:, :, feature]))
+        mask[invalid_indexes] = np.nan
+
+    # reshape each row-band matrix into a n*m array
+    nsamples = (row_max-row_min) * current.cols
+    flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+    # remove NaNs prior to passing to scikit-learn predict
+    flat_pixels = np.nan_to_num(flat_pixels)
+
+    # perform prediction for classification/regression
+    if predict_type == 'raw':
+        result = estimator.predict(flat_pixels)
+        result = result.reshape((row_max-row_min, current.cols))
+
+        # determine nodata value and grass raster type
+        if result.dtype == 'float':
+            nodata = np.nan
+            ftype = 'FCELL'
+        else:
+            nodata = -2147483648
+            ftype = 'CELL'
+
+        # replace NaN values so that the prediction does not have a border
+        result[np.nonzero(np.isnan(mask))] = nodata
+
+    # perform prediction for class probabilities
+    if predict_type == 'prob':
+        result = estimator.predict_proba(flat_pixels)
+        result = result.reshape((row_max-row_min, current.cols, result.shape[1]))
+        result[np.nonzero(np.isnan(mask))] = np.nan
+
+    # close maps
+    for i in range(n_features):
+        rasstack[i].close()
+
+    return (result, ftype)
+