[GRASS-SVN] r70075 - grass-addons/grass7/raster/r.randomforest
svn_grass at osgeo.org
Wed Dec 14 09:56:10 PST 2016
Author: spawley
Date: 2016-12-14 09:56:10 -0800 (Wed, 14 Dec 2016)
New Revision: 70075
Modified:
grass-addons/grass7/raster/r.randomforest/Makefile
grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
'r.randomforest refactoring'
Modified: grass-addons/grass7/raster/r.randomforest/Makefile
===================================================================
--- grass-addons/grass7/raster/r.randomforest/Makefile 2016-12-14 05:30:02 UTC (rev 70074)
+++ grass-addons/grass7/raster/r.randomforest/Makefile 2016-12-14 17:56:10 UTC (rev 70075)
@@ -2,9 +2,6 @@
PGM = r.randomforest
-ETCFILES = ml_classifiers ml_utils
-
include $(MODULE_TOPDIR)/include/Make/Script.make
-include $(MODULE_TOPDIR)/include/Make/Python.make
default: script
Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-12-14 05:30:02 UTC (rev 70074)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-12-14 17:56:10 UTC (rev 70075)
@@ -247,20 +247,19 @@
#% exclusive: save_training,load_training
#%end
-# import standard modules
import atexit
import os
import numpy as np
-from subprocess import PIPE
-
import grass.script as grass
+import tempfile
+import copy
from grass.pygrass.modules.shortcuts import imagery as im
+from subprocess import PIPE
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.modules.shortcuts import raster as r
-from grass.pygrass.utils import set_path
-set_path('r.randomforest')
-from ml_utils import *
-from ml_classifiers import model_classifiers
-
try:
import pandas as pd
except:
@@ -271,11 +270,28 @@
from sklearn.externals import joblib
from sklearn import metrics
from sklearn import preprocessing
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.model_selection import GroupKFold
+ from sklearn.model_selection import train_test_split
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.feature_selection import SelectKBest
+ from sklearn.feature_selection import f_classif
+ from sklearn.utils import shuffle
+ from sklearn.cluster import KMeans
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.ensemble import GradientBoostingClassifier
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.svm import SVC
except:
grass.fatal("Scikit learn is not installed")
-# check for sklearn version
if tuple(int(i) for i in sklearn.__version__.split('.')[:2]) < (0, 18):
grass.fatal("Scikit learn 0.18 or newer is required")
@@ -288,6 +304,574 @@
flags="f", type="raster", quiet=True)
+def model_classifiers(estimator, random_state, class_weight,
+ C, max_depth, max_features,
+ min_samples_split, min_samples_leaf,
+ n_estimators, subsample, learning_rate):
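+    """
+    Returns a scikit learn estimator and a dict of parameter ranges for
+    hyperparameter tuning, selected by estimator name
+
+    Parameters
+    ----------
+    estimator: String, name of the classifier or regressor
+    random_state: Seed to pass to the random number generator
+    class_weight, C, max_depth, max_features, min_samples_split,
+    min_samples_leaf, n_estimators, subsample, learning_rate:
+        Hyperparameters passed through to the scikit learn estimators
+
+    Returns
+    -------
+    clf: Scikit learn estimator object
+    params: Dict of parameter ranges for grid search
+    mode: String, classification or regression
+    """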
+
+ classifiers = {
+ 'SVC': SVC(C=C, probability=True, random_state=random_state),
+ 'LogisticRegression': LogisticRegression(C=C, class_weight=class_weight,
+ random_state=random_state),
+ 'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=max_depth,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state,
+ class_weight=class_weight),
+ 'DecisionTreeRegressor': DecisionTreeRegressor(max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state),
+ 'RandomForestClassifier': RandomForestClassifier(n_estimators=n_estimators,
+ class_weight=class_weight,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ random_state=random_state,
+ n_jobs=-1,
+ oob_score=True),
+ 'RandomForestRegressor': RandomForestRegressor(n_estimators=n_estimators,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ random_state=random_state,
+ n_jobs=-1,
+ oob_score=True),
+ 'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ subsample=subsample,
+ max_features=max_features,
+ random_state=random_state),
+ 'GradientBoostingRegressor': GradientBoostingRegressor(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ subsample=subsample,
+ max_features=max_features,
+ random_state=random_state),
+ 'GaussianNB': GaussianNB(),
+ 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+ 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+ }
+
+ SVCOpts = {'C': [1, 10, 100], 'shrinking': [True, False]}
+ LogisticRegressionOpts = {'C': [1, 10, 100, 1000]}
+ DecisionTreeOpts = {'max_depth': [2, 4, 6, 8, 10, 20],
+ 'max_features': ['sqrt', 'log2', None],
+ 'min_samples_split': [2, 0.01, 0.05, 0.1, 0.25]}
+ RandomForestOpts = {'max_features': ['sqrt', 'log2', None]}
+ GradientBoostingOpts = {'learning_rate': [0.01, 0.02, 0.05, 0.1],
+ 'max_depth': [3, 4, 6, 10],
+ 'max_features': ['sqrt', 'log2', None],
+ 'n_estimators': [50, 100, 150, 250, 500]}
+
+ param_grids = {
+ 'SVC': SVCOpts,
+ 'LogisticRegression': LogisticRegressionOpts,
+ 'DecisionTreeClassifier': DecisionTreeOpts,
+ 'DecisionTreeRegressor': DecisionTreeOpts,
+ 'RandomForestClassifier': RandomForestOpts,
+ 'RandomForestRegressor': RandomForestOpts,
+ 'GradientBoostingClassifier': GradientBoostingOpts,
+ 'GradientBoostingRegressor': GradientBoostingOpts,
+ 'GaussianNB': {},
+ 'LinearDiscriminantAnalysis': {},
+ 'QuadraticDiscriminantAnalysis': {},
+ }
+
+ # define classifier
+ clf = classifiers[estimator]
+ params = param_grids[estimator]
+
+    # classification or regression
+    if estimator in ['LogisticRegression', 'DecisionTreeClassifier',
+                     'RandomForestClassifier', 'GradientBoostingClassifier',
+                     'GaussianNB', 'LinearDiscriminantAnalysis',
+                     'QuadraticDiscriminantAnalysis', 'SVC']:
+        mode = 'classification'
+    else:
+        mode = 'regression'
+
+ return (clf, params, mode)
+
+
+def save_training_data(X, y, groups, file):
+
+ """
+ Saves any extracted training data to a csv file
+
+ Parameters
+ ----------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels
+ file: Path to a csv file to save data to
+
+ """
+
+ # if there are no group labels, create a nan filled array
+ if groups is None:
+ groups = np.empty((y.shape[0]))
+ groups[:] = np.nan
+
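+    # column layout of the saved data: predictor values, then labels, then group ids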
+ training_data = np.zeros((y.shape[0], X.shape[1]+2))
+ training_data[:, 0:X.shape[1]] = X
+ training_data[:, X.shape[1]] = y
+ training_data[:, X.shape[1]+1] = groups
+
+ np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+
+ """
+ Loads training data and labels from a csv file
+
+ Parameters
+ ----------
+    file: Path to a csv file to load data from
+
+ Returns
+ -------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels, or None
+
+ """
+
+ training_data = np.loadtxt(file, delimiter=',')
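+    # the last two columns of the saved data hold the labels and group ids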
+ n_features = training_data.shape[1]-1
+
+ # check to see if last column contains group labels or nans
+ groups = training_data[:, -1]
+ training_data = training_data[:, 0:n_features]
+
+    if np.isnan(groups).all():
+ # if all nans then ignore last column
+ groups = None
+
+ # fetch X and y
+ X = training_data[:, 0:n_features-1]
+ y = training_data[:, -1]
+
+ return(X, y, groups)
+
+
+def sample_predictors(response, predictors, shuffle_data, lowmem, random_state):
+
+ """
+ Samples a list of GRASS rasters using a labelled raster
+ Per raster sampling
+
+ Parameters
+ ----------
+ response: String; GRASS raster with labelled pixels
+ predictors: List of GRASS rasters containing explanatory variables
+
+ Returns
+ -------
+
+ training_data: Numpy array of extracted raster values
+ training_labels: Numpy array of labels
+ y_indexes: Row and Columns of label positions
+
+ """
+ current = Region()
+
+ # open response raster as rasterrow and read as np array
+    if RasterRow(response).exist():
+ roi_gr = RasterRow(response)
+ roi_gr.open('r')
+
+ if lowmem is False:
+ response_np = np.array(roi_gr)
+ else:
+ response_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+ response_np[:] = np.array(roi_gr)[:]
+ else:
+ grass.fatal("GRASS response raster does not exist.... exiting")
+
+ # determine number of predictor rasters
+ n_features = len(predictors)
+
+ # check to see if all predictors exist
+    for i in range(n_features):
+        if not RasterRow(predictors[i]).exist():
+            grass.fatal("GRASS raster " + predictors[i] +
+                        " does not exist.... exiting")
+
+ # check if any of those pixels are labelled (not equal to nodata)
+ # can use even if roi is FCELL because nodata will be nan
+ is_train = np.nonzero(response_np > -2147483648)
+ training_labels = response_np[is_train]
+ n_labels = np.array(is_train).shape[1]
+
+ # Create a zero numpy array of len training labels
+ if lowmem is False:
+ training_data = np.zeros((n_labels, n_features))
+ else:
+ training_data = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(n_labels, n_features))
+
+ # Loop through each raster and sample pixel values at training indexes
+ if lowmem is True:
+ feature_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+
+ for f in range(n_features):
+ predictor_gr = RasterRow(predictors[f])
+ predictor_gr.open('r')
+
+ if lowmem is False:
+ feature_np = np.array(predictor_gr)
+ else:
+ feature_np[:] = np.array(predictor_gr)[:]
+
+ training_data[0:n_labels, f] = feature_np[is_train]
+ predictor_gr.close()
+
+    # convert CELL nodata values (-2147483648) to NaN in the training data
+    for i in range(n_features):
+        training_data[training_data[:, i] == -2147483648, i] = np.nan
+
+ # convert indexes of training pixels from tuple to n*2 np array
+ is_train = np.array(is_train).T
+
+ # Remove nan rows from training data
+ X = training_data[~np.isnan(training_data).any(axis=1)]
+ y = training_labels[~np.isnan(training_data).any(axis=1)]
+ y_indexes = is_train[~np.isnan(training_data).any(axis=1)]
+
+ roi_gr.close()
+
+ # shuffle the training data
+    if shuffle_data:
+ X, y, y_indexes = shuffle(X, y, y_indexes, random_state=random_state)
+
+ return(X, y, y_indexes)
+
+
+def prediction(clf, labels, predictors, scaler, class_probabilities,
+ rowincr, output, mode):
+
+ """
+ Prediction on list of GRASS rasters using a fitted scikit learn model
+
+ Parameters
+ ----------
+ clf: Scikit learn estimator object
+ labels: Numpy array of the labels used for the classification
+ predictors: List of GRASS rasters
+ scaler: Scaler for predictors, or None
+    class_probabilities: Boolean, also predict class probabilities
+    rowincr: Integer of raster rows to process at one time
+    output: Name of GRASS raster to output classification results
+ mode: String, classification or regression mode
+
+ """
+
+ # create a list of rasterrow objects for predictors
+ n_features = len(predictors)
+ nclasses = len(labels)
+ rasstack = [0] * n_features
+
+ for i in range(n_features):
+ rasstack[i] = RasterRow(predictors[i])
+        if rasstack[i].exist():
+ rasstack[i].open('r')
+ else:
+ grass.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # use grass.pygrass.gis.region to get information about the current region
+ current = Region()
+
+    # create an imagery mask
+    # the input rasters might have different dimensions and null pixels;
+    # r.series is used to automatically create a mask by propagating the nulls
+ grass.run_command("r.series", output='tmp_clfmask',
+ input=predictors, method='count', flags='n',
+ overwrite=True)
+
+ mask_raster = RasterRow('tmp_clfmask')
+ mask_raster.open('r')
+
+ # create and open RasterRow objects for classification
+ classification = RasterRow(output)
+ if mode == 'classification':
+ ftype = 'CELL'
+ nodata = -2147483648
+ else:
+ ftype = 'FCELL'
+ nodata = np.nan
+ classification.open('w', ftype, overwrite=True)
+
+ # create and open RasterRow objects for probabilities if enabled
+    if class_probabilities:
+ prob_out_raster = [0] * nclasses
+ prob = [0] * nclasses
+ for iclass in range(nclasses):
+ prob_out_raster[iclass] = output + \
+ '_classPr' + str(int(labels[iclass]))
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ """
+ Prediction using row blocks
+ """
+
+ for rowblock in range(0, current.rows, rowincr):
+
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
+ mask_np_row = np.zeros((rowincr, current.cols))
+
+ # loop through each row, and each band
+ # and add these values to the 2D array img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
+
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
+
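+        # GRASS CELL nulls are read as -2147483648; convert them to NaN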
+ mask_np_row[mask_np_row == -2147483648] = np.nan
+ nanmask = np.isnan(mask_np_row) # True in the mask means invalid data
+
+ # reshape each row-band matrix into a n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaN values
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # rescale
+ if scaler is not None:
+ flat_pixels = scaler.transform(flat_pixels)
+
+ # perform prediction
+ result = clf.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
+
+ # replace NaN values so that the prediction does not have a border
+ result = np.ma.masked_array(result, mask=nanmask, fill_value=-99999)
+
+ # return a copy of result, with masked values filled with a value
+ result = result.filled([nodata])
+
+ # for each row we can perform computation, and write the result into
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
+
+ # same for probabilities
+        if class_probabilities and mode == 'classification':
+ result_proba = clf.predict_proba(flat_pixels)
+
+ for iclass in range(result_proba.shape[1]):
+
+ result_proba_class = result_proba[:, iclass]
+ result_proba_class = result_proba_class.reshape(
+ (rowincr, current.cols))
+
+ result_proba_class = np.ma.masked_array(
+ result_proba_class, mask=nanmask, fill_value=np.nan)
+
+ result_proba_class = result_proba_class.filled([np.nan])
+
+ for row in range(rowincr):
+
+ newrow = Buffer((
+ result_proba_class.shape[1],), mtype='FCELL')
+
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
+ classification.close()
+ mask_raster.close()
+
+    if class_probabilities and mode == 'classification':
+ for iclass in range(nclasses):
+ prob[iclass].close()
+
+
+def cross_val_classification(clf, X, y, group_ids, cv, rstate):
+
+ """
+    Stratified or group k-fold cross-validation
+    Generates several scoring metrics per fold
+
+ Parameters
+ ----------
+ clf: Scikit learn estimator object
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ group_ids: Numpy array containing group labels for samples
+ cv: Integer of cross-validation folds
+ rstate: Seed to pass to the random number generator
+
+ Returns
+ -------
+ scores: Dictionary of global accuracy measures per fold
+ y_test_agg: Aggregated test responses
+ y_pred_agg: Aggregated predicted responses
+
+ """
+
+ # dicts of lists to store metrics
+ scores = {
+ 'accuracy': [],
+ 'auc': [],
+ 'r2': []
+ }
+
+ # generate Kfold indices
+ if group_ids is None:
+ k_fold = StratifiedKFold(
+ n_splits=cv, shuffle=False, random_state=rstate).split(X, y)
+ else:
+
+ n_groups = len(np.unique(group_ids))
+
+ if cv > n_groups:
+ grass.message(
+ "Number of cv folds cannot be larger than the number of groups in the zonal raster, using n_groups")
+ cv = n_groups
+ k_fold = GroupKFold(n_splits=cv).split(X, y, groups=group_ids)
+
+ y_test_agg = []
+ y_pred_agg = []
+
+ # loop through each fold
+ for train_indices, test_indices in k_fold:
+
+ X_train, X_test = X[train_indices], X[test_indices]
+ y_train, y_test = y[train_indices], y[test_indices]
+
+ # fit the model on the training data and predict the test data
+ fit = clf.fit(X_train, y_train)
+ y_pred = fit.predict(X_test)
+
+ y_test_agg = np.append(y_test_agg, y_test)
+ y_pred_agg = np.append(y_pred_agg, y_pred)
+
+ # calculate metrics
+ try:
+ scores['accuracy'] = np.append(
+ scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
+ except:
+ pass
+
+ scores['r2'] = np.append(
+ scores['r2'], metrics.r2_score(y_test, y_pred))
+
+        # test whether the response is binary with classes 0,1
+        if np.array_equal(np.unique(y), [0, 1]):
+ try:
+ y_pred_proba = fit.predict_proba(X_test)[:, 1]
+
+ scores['auc'] = np.append(
+ scores['auc'], metrics.roc_auc_score(y_test, y_pred_proba))
+ except:
+ pass
+
+ return(scores, y_test_agg, y_pred_agg)
+
+
+def tune_split(X, y, Id, estimator, metric, params, test_size, random_state):
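+    """
+    Splits the training data into separate training and development sets,
+    then grid searches the parameter ranges on the development set using
+    3-fold cross-validation
+
+    Returns the reduced training data, the development data, and the
+    fitted GridSearchCV object
+    """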
+
+ if Id is None:
+ X, X_devel, y, y_devel = train_test_split(X, y, test_size=test_size,
+ random_state=random_state)
+ Id_devel = None
+ else:
+ X, X_devel, y, y_devel, Id, Id_devel = train_test_split(X, y, Id, test_size=test_size,
+ random_state=random_state, stratify=Id)
+
+ clf = GridSearchCV(estimator=estimator, cv=3, param_grid=params,
+ scoring=metric, n_jobs=-1)
+
+ clf.fit(X_devel, y_devel)
+
+ return (X, X_devel, y, y_devel, Id, Id_devel, clf)
+
+
+def feature_importances(clf, X, y):
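+    """
+    Returns the feature importances of the fitted estimator, falling back
+    to univariate ANOVA F-scores if the estimator does not expose
+    feature_importances_
+    """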
+
+    try:
+        clfimp = clf.feature_importances_
+    except AttributeError:
+        # estimator does not expose feature_importances_;
+        # fall back to univariate ANOVA F-scores
+        sk = SelectKBest(f_classif, k='all')
+        sk_fit = sk.fit(X, y)
+        clfimp = sk_fit.scores_
+
+ return (clfimp)
+
+
+def sample_training_data(roi, maplist, cv, cvtype, model_load,
+ load_training, save_training, lowmem, random_state):
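+    """
+    Loads a saved model or training data, or samples the predictor rasters
+    at the labelled pixels of the roi raster; optionally assigns group ids
+    for spatial cross-validation by clumping the roi raster or k-means
+    clustering of the sample coordinates
+
+    Returns
+    -------
+    X: Numpy array containing predictor values
+    y: Numpy array containing labels
+    Id: Numpy array of group labels, or None
+    clf: Previously fitted estimator if model_load is given, or None
+    """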
+
+ # load the model or training data
+ if model_load != '':
+ clf = joblib.load(model_load)
+ X, y, Id = load_training_data(
+ model_load.replace('.pkl', '.csv'))
+ else:
+ clf = None
+ if load_training != '':
+ X, y, Id = load_training_data(load_training)
+ else:
+ # create clumped roi for spatial cross validation
+ if cv > 1 and cvtype == 'clumped':
+ r.clump(input=roi, output='tmp_roi_clumped', overwrite=True, quiet=True)
+ maplist2 = copy.deepcopy(maplist)
+ maplist2.append('tmp_roi_clumped')
+ X, y, sample_coords = sample_predictors(response=roi,
+ predictors=maplist2,
+ shuffle_data=False,
+ lowmem=lowmem,
+ random_state=random_state)
+ # take Id from last column
+ Id = X[:, -1]
+
+ # remove Id column from predictors
+ X = X[:, 0:X.shape[1]-1]
+ else:
+ # query predictor rasters with training features
+ Id = None
+ X, y, sample_coords = sample_predictors(
+ response=roi, predictors=maplist, shuffle_data=True,
+ lowmem=lowmem, random_state=random_state)
+
+ # perform kmeans clustering on point coordinates
+ if cv > 1 and cvtype == 'kmeans':
+ clusters = KMeans(
+ n_clusters=cv, random_state=random_state, n_jobs=-1)
+ clusters.fit(sample_coords)
+ Id = clusters.labels_
+
+ if save_training != '':
+ save_training_data(X, y, Id, save_training)
+
+ return (X, y, Id, clf)
+
+
def main():
"""