[GRASS-SVN] r69865 - grass-addons/grass7/raster/r.randomforest

svn_grass at osgeo.org svn_grass at osgeo.org
Mon Nov 21 22:35:34 PST 2016


Author: spawley
Date: 2016-11-21 22:35:34 -0800 (Mon, 21 Nov 2016)
New Revision: 69865

Added:
   grass-addons/grass7/raster/r.randomforest/ml_classifiers.py
   grass-addons/grass7/raster/r.randomforest/ml_utils.py
Modified:
   grass-addons/grass7/raster/r.randomforest/Makefile
   grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
'update to r.randomforest to add spatial cross-validation and simplify options'

Modified: grass-addons/grass7/raster/r.randomforest/Makefile
===================================================================
--- grass-addons/grass7/raster/r.randomforest/Makefile	2016-11-21 20:59:01 UTC (rev 69864)
+++ grass-addons/grass7/raster/r.randomforest/Makefile	2016-11-22 06:35:34 UTC (rev 69865)
@@ -2,6 +2,9 @@
 
 PGM = r.randomforest
 
+ETCFILES = ml_classifiers ml_utils
+
 include $(MODULE_TOPDIR)/include/Make/Script.make
+include $(MODULE_TOPDIR)/include/Make/Python.make
 
 default: script

Added: grass-addons/grass7/raster/r.randomforest/ml_classifiers.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/ml_classifiers.py	                        (rev 0)
+++ grass-addons/grass7/raster/r.randomforest/ml_classifiers.py	2016-11-22 06:35:34 UTC (rev 69865)
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+from sklearn.naive_bayes import GaussianNB
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.svm import SVC
+
+
+def model_classifiers(estimator, random_state, class_weight,
+                      C, max_depth, max_features,
+                      min_samples_split, min_samples_leaf,
+                      n_estimators, subsample, learning_rate):
+
+    classifiers = {
+        'SVC': SVC(C=C, probability=True, random_state=random_state),
+        'LogisticRegression': LogisticRegression(C=C, class_weight=class_weight,
+                                                 random_state=random_state),
+        'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=max_depth,
+                                                         max_features=max_features,
+                                                         min_samples_split=min_samples_split,
+                                                         min_samples_leaf=min_samples_leaf,
+                                                         random_state=random_state,
+                                                         class_weight=class_weight),
+        'DecisionTreeRegressor': DecisionTreeRegressor(max_features=max_features,
+                                                       min_samples_split=min_samples_split,
+                                                       min_samples_leaf=min_samples_leaf,
+                                                       random_state=random_state),
+        'RandomForestClassifier': RandomForestClassifier(n_estimators=n_estimators,
+                                                         class_weight=class_weight,
+                                                         max_features=max_features,
+                                                         min_samples_split=min_samples_split,
+                                                         random_state=random_state,
+                                                         n_jobs=-1,
+                                                         oob_score=True),
+        'RandomForestRegressor': RandomForestRegressor(n_estimators=n_estimators,
+                                                       max_features=max_features,
+                                                       min_samples_split=min_samples_split,
+                                                       random_state=random_state,
+                                                       n_jobs=-1,
+                                                       oob_score=True),
+        'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=learning_rate,
+                                                                 n_estimators=n_estimators,
+                                                                 max_depth=max_depth,
+                                                                 min_samples_split=min_samples_split,
+                                                                 min_samples_leaf=min_samples_leaf,
+                                                                 subsample=subsample,
+                                                                 max_features=max_features,
+                                                                 random_state=random_state),
+        'GradientBoostingRegressor': GradientBoostingRegressor(learning_rate=learning_rate,
+                                                               n_estimators=n_estimators,
+                                                               max_depth=max_depth,
+                                                               min_samples_split=min_samples_split,
+                                                               min_samples_leaf=min_samples_leaf,
+                                                               subsample=subsample,
+                                                               max_features=max_features,
+                                                               random_state=random_state),
+        'GaussianNB': GaussianNB(),
+        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+    }
+
+    SVCOpts = {'C': [1, 10, 100], 'shrinking': [True, False]}
+    LogisticRegressionOpts = {'C': [1, 10, 100, 1000]}
+    DecisionTreeOpts = {'max_depth': [2, 4, 6, 8, 10, 20],
+                        'max_features': ['sqrt', 'log2', None],
+                        'min_samples_split': [2, 0.01, 0.05, 0.1, 0.25]}
+    RandomForestOpts = {'max_features': ['sqrt', 'log2', None]}
+    GradientBoostingOpts = {'learning_rate': [0.01, 0.02, 0.05, 0.1],
+                            'max_depth': [3, 4, 6, 10],
+                            'max_features': ['sqrt', 'log2', None],
+                            'n_estimators': [50, 100, 150, 250, 500]}
+
+    param_grids = {
+        'SVC': SVCOpts,
+        'LogisticRegression': LogisticRegressionOpts,
+        'DecisionTreeClassifier': DecisionTreeOpts,
+        'DecisionTreeRegressor': DecisionTreeOpts,
+        'RandomForestClassifier': RandomForestOpts,
+        'RandomForestRegressor': RandomForestOpts,
+        'GradientBoostingClassifier': GradientBoostingOpts,
+        'GradientBoostingRegressor': GradientBoostingOpts,
+        'GaussianNB': {},
+        'LinearDiscriminantAnalysis': {},
+        'QuadraticDiscriminantAnalysis': {},
+    }
+    
+    # define classifier
+    clf = classifiers[estimator]
+    params = param_grids[estimator]
+    
+    # classification or regression
+    if estimator in ('LogisticRegression', 'DecisionTreeClassifier',
+                     'RandomForestClassifier', 'GradientBoostingClassifier',
+                     'GaussianNB', 'LinearDiscriminantAnalysis',
+                     'QuadraticDiscriminantAnalysis', 'SVC'):
+        mode = 'classification'
+    else:
+        mode = 'regression'
+
+    return (clf, params, mode)
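
For reference, a minimal sketch of driving model_classifiers() as defined
above; the argument values are illustrative rather than defaults taken from
this commit:

    from ml_classifiers import model_classifiers

    # returns an unfitted estimator, a parameter grid for GridSearchCV,
    # and whether the task is classification or regression
    clf, param_grid, mode = model_classifiers(
        estimator='RandomForestClassifier', random_state=1, class_weight=None,
        C=1.0, max_depth=None, max_features='auto', min_samples_split=2,
        min_samples_leaf=1, n_estimators=500, subsample=1.0, learning_rate=0.1)
    # here mode == 'classification' and param_grid is RandomForestOpts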

Added: grass-addons/grass7/raster/r.randomforest/ml_utils.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/ml_utils.py	                        (rev 0)
+++ grass-addons/grass7/raster/r.randomforest/ml_utils.py	2016-11-22 06:35:34 UTC (rev 69865)
@@ -0,0 +1,513 @@
+import os
+import numpy as np
+import grass.script as grass
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from sklearn.model_selection import StratifiedKFold
+from sklearn import metrics
+from sklearn.model_selection import GroupKFold
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV
+from sklearn import preprocessing
+from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import f_classif
+from sklearn.utils import shuffle
+
+from sklearn.externals import joblib
+from grass.pygrass.modules.shortcuts import raster as r
+from sklearn.cluster import KMeans
+
+
+
+def save_training_data(X, y, groups, file):
+
+    """
+    Saves any extracted training data to a csv file
+
+    Parameters
+    ----------
+    X: Numpy array containing predictor values
+    y: Numpy array containing labels
+    groups: Numpy array of group labels
+    file: Path to a csv file to save data to
+
+    """
+
+    # if there are no group labels, create a nan filled array
+    if groups is None:
+        groups = np.empty((y.shape[0]))
+        groups[:] = np.nan
+
+    training_data = np.zeros((y.shape[0], X.shape[1]+2))
+    training_data[:, 0:X.shape[1]] = X
+    training_data[:, X.shape[1]] = y
+    training_data[:, X.shape[1]+1] = groups
+
+    np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+
+    """
+    Loads training data and labels from a csv file
+
+    Parameters
+    ----------
+    file: Path to a csv file to load data from
+
+    Returns
+    -------
+    X: Numpy array containing predictor values
+    y: Numpy array containing labels
+    groups: Numpy array of group labels, or None
+
+    """
+
+    training_data = np.loadtxt(file, delimiter=',')
+
+    # check to see if the last column contains group labels or nans
+    lastcol = training_data[:, -1]
+
+    if np.isnan(lastcol).all():
+        groups = None
+    else:
+        groups = lastcol
+
+    # retrieve X and y; the last two columns hold the labels and group ids
+    X = training_data[:, 0:-2]
+    y = training_data[:, -2]
+
+    return(X, y, groups)
+
+
+def sample_predictors(response, predictors, shuffle_data=True, random_state=1):
+
+    """
+    Samples a list of GRASS rasters using a labelled raster
+    Per raster sampling
+
+    Parameters
+    ----------
+    response: String; GRASS raster with labelled pixels
+    predictors: List of GRASS rasters containing explanatory variables
+
+    Returns
+    -------
+
+    X: Numpy array of extracted raster values
+    y: Numpy array of labels
+    y_indexes: Rows and columns of the labelled pixels
+
+    """
+
+    # open response raster as rasterrow and read as np array
+    if RasterRow(response).exist():
+        roi_gr = RasterRow(response)
+        roi_gr.open('r')
+        response_np = np.array(roi_gr)
+    else:
+        grass.fatal("GRASS response raster does not exist.... exiting")
+
+    # determine number of predictor rasters
+    n_features = len(predictors)
+
+    # check to see if all predictors exist
+    for i in range(n_features):
+        if not RasterRow(predictors[i]).exist():
+            grass.fatal("GRASS raster " + predictors[i] +
+                        " does not exist.... exiting")
+
+    # check if any of those pixels are labelled (not equal to nodata)
+    # can use even if roi is FCELL because nodata will be nan
+    is_train = np.nonzero(response_np > -2147483648)
+    training_labels = response_np[is_train]
+    n_labels = np.array(is_train).shape[1]
+
+    # Create a zero numpy array of len training labels
+    training_data = np.zeros((n_labels, n_features))
+
+    # Loop through each raster and sample pixel values at training indexes
+    for f in range(n_features):
+        predictor_gr = RasterRow(predictors[f])
+        predictor_gr.open('r')
+        feature_np = np.array(predictor_gr)
+        training_data[0:n_labels, f] = feature_np[is_train]
+        predictor_gr.close()
+
+    # flag rows containing CELL nodata values as NaN so they are removed below
+    for i in range(n_features):
+        training_data[training_data[:, i] == -2147483648] = np.nan
+
+    # convert indexes of training pixels from tuple to n*2 np array
+    is_train = np.array(is_train).T
+
+    # Remove nan rows from training data
+    X = training_data[~np.isnan(training_data).any(axis=1)]
+    y = training_labels[~np.isnan(training_data).any(axis=1)]
+    y_indexes = is_train[~np.isnan(training_data).any(axis=1)]
+
+    roi_gr.close()
+
+    # shuffle the training data
+    if shuffle_data:
+        X, y, y_indexes = shuffle(X, y, y_indexes, random_state=random_state)
+
+    return(X, y, y_indexes)
+
+
+def prediction(clf, labels, predictors, scaler, class_probabilities,
+               rowincr, output, mode):
+
+    """
+    Prediction on list of GRASS rasters using a fitted scikit learn model
+
+    Parameters
+    ----------
+    clf: Scikit learn estimator object
+    labels: Numpy array of the labels used for the classification
+    predictors: List of GRASS rasters
+    scaler: Scaler for predictors, or None
+    class_probabilities: Boolean; also predict class membership probabilities
+    rowincr: Integer of raster rows to process at one time
+    output: Name of GRASS raster to output classification results
+    mode: String, classification or regression mode
+
+    """
+
+    # create a list of rasterrow objects for predictors
+    n_features = len(predictors)
+    nclasses = len(labels)
+    rasstack = [0] * n_features
+
+    for i in range(n_features):
+        rasstack[i] = RasterRow(predictors[i])
+        if rasstack[i].exist():
+            rasstack[i].open('r')
+        else:
+            grass.fatal("GRASS raster " + predictors[i] +
+                        " does not exist.... exiting")
+
+    # use grass.pygrass.gis.region to get information about the current region
+    current = Region()
+
+    # create an imagery mask
+    # the input rasters might have different dimensions and non-value pixels.
+    # r.series used to automatically create a mask by propagating the nulls
+    grass.run_command("r.series", output='tmp_clfmask',
+                      input=predictors, method='count', flags='n',
+                      overwrite=True)
+
+    mask_raster = RasterRow('tmp_clfmask')
+    mask_raster.open('r')
+
+    # create and open RasterRow objects for classification
+    classification = RasterRow(output)
+    if mode == 'classification':
+        ftype = 'CELL'
+        nodata = -2147483648
+    else:
+        ftype = 'FCELL'
+        nodata = np.nan
+    classification.open('w', ftype, overwrite=True)
+
+    # create and open RasterRow objects for probabilities if enabled
+    if class_probabilities:
+        prob_out_raster = [0] * nclasses
+        prob = [0] * nclasses
+        for iclass in range(nclasses):
+            prob_out_raster[iclass] = output + \
+                '_classPr' + str(int(labels[iclass]))
+            prob[iclass] = RasterRow(prob_out_raster[iclass])
+            prob[iclass].open('w', 'FCELL', overwrite=True)
+
+    """
+    Prediction using row blocks
+    """
+
+    for rowblock in range(0, current.rows, rowincr):
+
+        # check that the row increment does not exceed the number of rows
+        if rowblock+rowincr > current.rows:
+            rowincr = current.rows - rowblock
+        img_np_row = np.zeros((rowincr, current.cols, n_features))
+        mask_np_row = np.zeros((rowincr, current.cols))
+
+        # loop through each row, and each band
+        # and add these values to the 2D array img_np_row
+        for row in range(rowblock, rowblock+rowincr, 1):
+            mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
+
+            for band in range(n_features):
+                img_np_row[row-rowblock, :, band] = \
+                    np.array(rasstack[band][row])
+
+        mask_np_row[mask_np_row == -2147483648] = np.nan
+        nanmask = np.isnan(mask_np_row)  # True in the mask means invalid data
+
+        # reshape each row-band matrix into a n*m array
+        nsamples = rowincr * current.cols
+        flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+        # remove NaN values
+        flat_pixels = np.nan_to_num(flat_pixels)
+
+        # rescale
+        if scaler is not None:
+            flat_pixels = scaler.transform(flat_pixels)
+
+        # perform prediction
+        result = clf.predict(flat_pixels)
+        result = result.reshape((rowincr, current.cols))
+
+        # replace NaN values so that the prediction does not have a border
+        result = np.ma.masked_array(result, mask=nanmask, fill_value=-99999)
+
+        # return a copy of result, with masked values filled with a value
+        result = result.filled([nodata])
+
+        # write the classification result into the output raster row by row
+        for row in range(rowincr):
+            newrow = Buffer((result.shape[1],), mtype=ftype)
+            newrow[:] = result[row, :]
+            classification.put_row(newrow)
+
+        # same for probabilities
+        if class_probabilities and mode == 'classification':
+            result_proba = clf.predict_proba(flat_pixels)
+
+            for iclass in range(result_proba.shape[1]):
+
+                result_proba_class = result_proba[:, iclass]
+                result_proba_class = result_proba_class.reshape(
+                                        (rowincr, current.cols))
+
+                result_proba_class = np.ma.masked_array(
+                    result_proba_class, mask=nanmask, fill_value=np.nan)
+
+                result_proba_class = result_proba_class.filled([np.nan])
+
+                for row in range(rowincr):
+
+                    newrow = Buffer((
+                                result_proba_class.shape[1],), mtype='FCELL')
+
+                    newrow[:] = result_proba_class[row, :]
+                    prob[iclass].put_row(newrow)
+
+    classification.close()
+    mask_raster.close()
+
+    if class_probabilities and mode == 'classification':
+        for iclass in range(nclasses):
+            prob[iclass].close()
+
+
+def cross_val_classification(clf, X, y, group_ids, cv, rstate):
+
+    """
+    Stratified or spatially grouped k-fold cross-validation
+    Generates several scoring metrics per fold
+
+    Parameters
+    ----------
+    clf: Scikit learn estimator object
+    X: Numpy array containing predictor values
+    y: Numpy array containing labels
+    group_ids: Numpy array containing group labels for samples
+    cv: Integer of cross-validation folds
+    rstate: Seed to pass to the random number generator
+
+    Returns
+    -------
+    scores: Dictionary of global accuracy measures per fold
+    y_test_agg: Aggregated test responses
+    y_pred_agg: Aggregated predicted responses
+
+    """
+
+    # dicts of lists to store metrics
+    scores = {
+        'accuracy': [],
+        'auc': [],
+        'r2': []
+    }
+
+    # generate Kfold indices
+    if group_ids is None:
+        k_fold = StratifiedKFold(
+            n_splits=cv, shuffle=False, random_state=rstate).split(X, y)
+    else:
+
+        n_groups = len(np.unique(group_ids))
+
+        if cv > n_groups:
+            grass.message(
+                "Number of cv folds cannot be larger than the number of groups in the zonal raster, using n_groups")
+            cv = n_groups
+        k_fold = GroupKFold(n_splits=cv).split(X, y, groups=group_ids)
+
+    y_test_agg = []
+    y_pred_agg = []
+
+    # loop through each fold
+    for train_indices, test_indices in k_fold:
+
+        X_train, X_test = X[train_indices], X[test_indices]
+        y_train, y_test = y[train_indices], y[test_indices]
+
+        # fit the model on the training data and predict the test data
+        fit = clf.fit(X_train, y_train)
+        y_pred = fit.predict(X_test)
+
+        y_test_agg = np.append(y_test_agg, y_test)
+        y_pred_agg = np.append(y_pred_agg, y_pred)
+
+        # calculate metrics
+        scores['accuracy'] = np.append(
+            scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
+
+        scores['r2'] = np.append(
+            scores['r2'], metrics.r2_score(y_test, y_pred))
+
+        # test for if binary and classes equal 0,1
+        if np.array_equal(np.unique(y), [0, 1]):
+            try:
+                y_pred_proba = fit.predict_proba(X_test)[:, 1]
+
+                scores['auc'] = np.append(
+                    scores['auc'], metrics.roc_auc_score(y_test, y_pred_proba))
+            except:
+                # skip AUC if the estimator has no predict_proba,
+                # or if a fold contains only one class
+                pass
+
+    return(scores, y_test_agg, y_pred_agg)
+
+
+def tune_split(X, y, Id, estimator, params, test_size, random_state):
+
+    if Id is None:
+        X, X_devel, y, y_devel = train_test_split(
+            X, y, test_size=test_size, random_state=random_state)
+        Id_devel = None
+    else:
+        X, X_devel, y, y_devel, Id, Id_devel = train_test_split(
+            X, y, Id, test_size=test_size, random_state=random_state,
+            stratify=Id)
+
+    clf = GridSearchCV(estimator=estimator, cv=3, param_grid=params,
+                       scoring="accuracy", n_jobs=-1)
+
+    clf.fit(X_devel, y_devel)
+
+    return (X, X_devel, y, y_devel, Id, Id_devel, clf)
+
+
+def feature_importances(clf, X, y):
+
+    min_max_scaler = preprocessing.MinMaxScaler()
+
+    try:
+        # tree-based ensembles expose feature_importances_
+        clfimp = min_max_scaler.fit_transform(
+                    clf.feature_importances_.reshape(-1, 1))
+    except AttributeError:
+        try:
+            # linear models expose coef_
+            clfimp = min_max_scaler.fit_transform(
+                        abs(clf.coef_.T).reshape(-1, 1))
+        except AttributeError:
+            # otherwise fall back to univariate F-test scores
+            sk = SelectKBest(f_classif, k='all')
+            sk_fit = sk.fit(X, y)
+            clfimp = min_max_scaler.fit_transform(
+                        sk_fit.scores_.reshape(-1, 1))
+
+    return (clfimp)
+
+
+def sample_training_data(roi, maplist, cv, cvtype, model_load, model_save, load_training, save_training, random_state):
+    
+    # load the model or training data
+    if model_load != '':
+        clf = joblib.load(model_load)
+        X, y, Id = load_training_data(
+                            model_load.replace('.pkl', '.csv'))
+    else:
+        clf = None
+        if load_training != '':
+            X, y, Id = load_training_data(load_training)
+        else:
+            # create clumped roi for spatial cross validation
+            if cv > 1 and cvtype == 'clumped':
+                r.clump(input=roi, output='tmp_roi_clumped',
+                        overwrite=True, quiet=True)
+                maplist2 = list(maplist)  # copy so the caller's list is not mutated
+                maplist2.append('tmp_roi_clumped')
+                X, y, sample_coords = sample_predictors(response=roi,
+                                                        predictors=maplist2,
+                                                        shuffle_data=False)
+                # take Id from the last column
+                Id = X[:, X.shape[1]-1]
+
+                # remove the Id column from the predictors
+                X = X[:, 0:X.shape[1]-1]
+            else:
+                # query predictor rasters with training features
+                Id = None
+                X, y, sample_coords = sample_predictors(
+                    roi, maplist, shuffle_data=True, random_state=random_state)
+
+            if save_training != '':
+                save_training_data(X, y, Id, save_training)
+                
+            if model_save != '':
+                save_training_data(X, y, Id, model_save + ".csv")
+
+    # perform kmeans clustering on the sample point coordinates
+    # (requires freshly sampled data; coordinates are not stored in csv files)
+    if cv > 1 and cvtype == 'kmeans':
+        clusters = KMeans(
+            n_clusters=cv, random_state=random_state, n_jobs=-1)
+        clusters.fit(sample_coords)
+        Id = clusters.labels_
+
+    return (X, y, Id, clf)
+
+
+def classifier_comparison(classifiers, X, y, cv, scoring_metric, param_grids=None):
+
+    import pandas as pd
+    from sklearn import cross_validation
+
+    # compare multiple models with parameter grid search
+
+    # lists and pandas dataframes to store model and accuracy results
+    n_models = len(classifiers)
+    model = [0] * n_models
+    cmstats = [0] * n_models
+    comparison_df = pd.DataFrame(index=range(n_models), columns=['Model',
+                                                                 'mean_score',
+                                                                 'min_score',
+                                                                 'max_score',
+                                                                 'std_score',
+                                                                 'scores'])
+
+    # perform cross-validation on each classifier and param_grid in the list
+    for clfm, i in zip(classifiers, range(n_models)):
+
+        # use nested cross validation with parameter tuning
+        if param_grids is not None:
+            model[i], cmstats[i] = (
+                nested_cross_val(classifiers[clfm], X, y, param_grids[clfm],
+                                 cv=cv, scoring_metric=scoring_metric))
+
+        # or use default classifier settings with no tuning
+        else:
+            cmstats[i] = cross_validation.cross_val_score(
+                classifiers[clfm], X, y, scoring=scoring_metric, cv=cv)
+
+        comparison_df.iloc[i, 0] = clfm
+        comparison_df.iloc[i, 1] = cmstats[i].mean()
+        comparison_df.iloc[i, 2] = cmstats[i].min()
+        comparison_df.iloc[i, 3] = cmstats[i].max()
+        comparison_df.iloc[i, 4] = cmstats[i].std()
+        comparison_df.iloc[i, 5] = cmstats[i]
+
+    return (comparison_df)
+
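
The key change in cross_val_classification() above is swapping StratifiedKFold
for GroupKFold whenever group ids (from r.clump or from k-means clustering of
the sample coordinates) are supplied, so that whole spatial groups are held out
together. A self-contained sketch of that idea on synthetic data, assuming
scikit-learn >= 0.18:

    import numpy as np
    from sklearn.model_selection import GroupKFold
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.RandomState(1)
    X = rng.rand(100, 3)             # 100 samples, 3 predictors
    y = rng.randint(0, 2, 100)       # binary labels
    groups = rng.randint(0, 5, 100)  # e.g. clump ids or k-means cluster ids

    clf = RandomForestClassifier(n_estimators=50, random_state=1)
    for train, test in GroupKFold(n_splits=5).split(X, y, groups=groups):
        clf.fit(X[train], y[train])
        print(clf.score(X[test], y[test]))  # accuracy per spatial fold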

Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-11-21 20:59:01 UTC (rev 69864)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py	2016-11-22 06:35:34 UTC (rev 69865)
@@ -1,9 +1,9 @@
 #!/usr/bin/env python
 ############################################################################
-# MODULE:       r.randomforest
-# AUTHOR:       Steven Pawley
-# PURPOSE:      Supervised classification and regression of GRASS rasters using the 
-#               python scikit-learn package
+# MODULE:        r.randomforest
+# AUTHOR:        Steven Pawley
+# PURPOSE:       Supervised classification and regression of GRASS rasters
+#                using the python scikit-learn package
 #
 # COPYRIGHT: (c) 2016 Steven Pawley, and the GRASS Development Team
 #                This program is free software under the GNU General Public
@@ -49,169 +49,113 @@
 #% label: Classifier
 #% description: Supervised learning model to use
 #% answer: RandomForestClassifier
-#% options: LogisticRegression,LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis,DecisionTreeClassifier,DecisionTreeRegressor,RandomForestClassifier,RandomForestRegressor,GradientBoostingClassifier,GradientBoostingRegressor,GaussianNB
+#% options: LogisticRegression,LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis,GaussianNB,DecisionTreeClassifier,DecisionTreeRegressor,RandomForestClassifier,RandomForestRegressor,GradientBoostingClassifier,GradientBoostingRegressor,SVC
 #%end
 
-# Logistic regression options
-
 #%option double
-#% key: c_lr
-#% description: Inverse of regularization strength
+#% key: c
+#% description: Inverse of regularization strength (logistic regression and SVC)
 #% answer: 1.0
-#% guisection: Logistic Regression
+#% guisection: Classifier Parameters
 #%end
 
-#%flag
-#% key: i
-#% description: Fit intercept in logistic regression
-#% guisection: Logistic Regression
+#%option
+#% key: max_features
+#% type: integer
+#% description: Number of features to consider during splitting for tree-based classifiers. Default -1 is sqrt(n_features) for classification, and n_features for regression
+#% answer: -1
+#% guisection: Classifier Parameters
 #%end
 
-# Decision tree options
-#%option string
-#% key: splitter_dt
-#% description: The strategy used to choose the split at each node
-#% answer: best
-#% options: best,random
-#% guisection: Decision Tree
-#%end
-
 #%option
-#% key: m_features_dt
+#% key: max_depth
 #% type: integer
-#% description: The number of features to consider when looking for the best split. Default -1 is sqrt(n_features) for classification, and n_features for regression
+#% description: Maximum tree depth for tree-based classifiers. Value of -1 uses classifier defaults
 #% answer: -1
-#% guisection: Decision Tree
+#% guisection: Classifier Parameters
 #%end
 
 #%option
-#% key: min_samples_split_dt
+#% key: min_samples_split
 #% type: integer
-#% description: The minimum number of samples required to split an internal node
+#% description: The minimum number of samples required to split an internal node for tree-based classifiers
 #% answer: 2
-#% guisection: Decision Tree
+#% guisection: Classifier Parameters
 #%end
 
 #%option
-#% key: min_samples_leaf_dt
+#% key: min_samples_leaf
 #% type: integer
-#% description: The minimum number of samples required to be at a leaf node
+#% description: The minimum number of samples required to be at a leaf node for tree-based classifiers
 #% answer: 1
-#% guisection: Decision Tree
+#% guisection: Classifier Parameters
 #%end
 
 #%option
-#% key: min_weight_fraction_leaf_dt
+#% key: n_estimators
 #% type: integer
-#% description: The minimum weighted fraction of the input samples required to be at a leaf node
-#% answer: 0
-#% guisection: Decision Tree
-#%end
-
-# Random Forest Options
-
-#%option
-#% key: ntrees_rf
-#% type: integer
-#% description: Number of trees in the forest
+#% description: Number of estimators for tree-based classifiers
 #% answer: 500
-#% guisection: Random Forest
+#% guisection: Classifier Parameters
 #%end
 
 #%option
-#% key: m_features_rf
-#% type: integer
-#% description: The number of features allowed at each split. Default -1 is sqrt(n_features) for classification, and n_features for regression
-#% answer: -1
-#% guisection: Random Forest
+#% key: learning_rate
+#% type: double
+#% description: Learning rate for gradient boosting
+#% answer: 0.1
+#% guisection: Classifier Parameters
 #%end
 
 #%option
-#% key: minsplit_rf
-#% type: integer
-#% description: The minimum number of samples required to split a node
-#% answer: 2
-#% guisection: Random Forest
+#% key: subsample
+#% type: double
+#% description: The fraction of samples to be used for fitting for gradient boosting
+#% answer: 1.0
+#% guisection: Classifier Parameters
 #%end
 
-# Gradient tree boosting options
+# General options
 
-#%option
-#% key: learning_rate_gtb
-#% type: double
-#% description: learning rate shrinks the contribution of each tree
-#% answer: 0.1
-#% guisection: Gradient Boosted Trees
+#%flag
+#% key: g
+#% label: Print as a shell style script
+#% guisection: Optional
 #%end
 
-#%option
-#% key: n_estimators_gtb
-#% type: integer
-#% description: The number of boosting stages to perform
-#% answer: 100
-#% guisection: Gradient Boosted Trees
-#%end
 
-#%option
-#% key: max_depth_gtb
-#% type: integer
-#% description: The maximum depth limits the number of nodes in the tree
-#% answer: 3
-#% guisection: Gradient Boosted Trees
+#%flag
+#% key: n
+#% label: Normalization
+#% guisection: Optional
 #%end
 
 #%option
-#% key: min_samples_split_gtb
+#% key: cv
 #% type: integer
-#% description: The minimum number of samples required to split an internal node
-#% answer: 2
-#% guisection: Gradient Boosted Trees
-#%end
-
-#%option
-#% key: min_samples_leaf_gtb
-#% type: integer
-#% description: The minimum number of samples required to be at a leaf node
+#% description: Number of cross-validation folds to perform if cv > 1
 #% answer: 1
-#% guisection: Gradient Boosted Trees
+#% guisection: Optional
 #%end
 
-#%option
-#% key: min_weight_fraction_leaf_gtb
-#% type: double
-#% description: The minimum weighted fraction of the input samples required to be at a leaf node
-#% answer: 0.
-#% guisection: Gradient Boosted Trees
+#%option string
+#% key: cvtype
+#% required: no
+#% label: Non-spatial or spatial cross-validation
+#% description: Non-spatial, clumped or clustered k-fold cross-validation
+#% answer: non-spatial
+#% options: non-spatial,clumped,kmeans
 #%end
 
-#%option
-#% key: subsample_gtb
-#% type: double
-#% description: The fraction of samples to be used for fitting the individual base learners
-#% answer: 1.0
-#% guisection: Gradient Boosted Trees
-#%end
-
-#%option
-#% key: max_features_gtb
-#% type: integer
-#% description: The number of features to consider during splitting. Default -1 is sqrt(n_features) for classification mode, and n_features for regression mode
-#% answer: -1
-#% guisection: Gradient Boosted Trees
-#%end
-
-# General options
-
-#%option
-#% key: cv
-#% type: integer
-#% description: Use k-fold cross-validation when cv > 1
-#% answer: 1
+#%option G_OPT_F_OUTPUT
+#% key: errors_file
+#% label: Save cross-validation global accuracy results to csv
+#% required: no
 #% guisection: Optional
 #%end
 
 #%option
-#% key: randst
+#% key: random_state
 #% type: integer
 #% description: Seed to pass onto the random state for reproducible results
 #% answer: 1
@@ -226,14 +170,6 @@
 #% guisection: Optional
 #%end
 
-#%option
-#% key: ncores
-#% type: integer
-#% description: Number of processing cores. Default -1 is all cores
-#% answer: -1
-#% guisection: Optional
-#%end
-
 #%flag
 #% key: p
 #% label: Output class membership probabilities
@@ -248,934 +184,336 @@
 
 #%flag
 #% key: f
-#% description: Output feature importances for ensemble tree-based models
+#% description: Calculate feature importances using bagging or univariate methods
 #% guisection: Optional
 #%end
 
+#%option G_OPT_F_OUTPUT
+#% key: fimp_file
+#% label: Save feature importances to csv
+#% required: no
+#% guisection: Optional
+#%end
+
 #%flag
 #% key: b
-#% description: Balance number of observations by weighting for logistic regression and tree-based classifiers
+#% description: Balance number of observations by weighting for logistic regression, CART and RF methods
 #% guisection: Optional
 #%end
 
 #%flag
-#% key: l
-#% description: Low memory version - samples predictors row-by-row (slower)
+#% key: h
+#% description: Perform parameter tuning on a split of the training data
 #% guisection: Optional
 #%end
 
+#%option
+#% key: ratio
+#% type: double
+#% description: Fraction (0-1) of training data to reserve for model development and tuning
+#% answer: 0.25
+#% guisection: Optional
+#%end
+
 #%option G_OPT_F_OUTPUT
-#% key: savefile
-#% label: Save model from file
+#% key: save_training
+#% label: Save training data to csv
 #% required: no
 #% guisection: Optional
 #%end
 
 #%option G_OPT_F_INPUT
-#% key: loadfile
-#% label: Load model from file
+#% key: load_training
+#% label: Load training data from csv
 #% required: no
 #% guisection: Optional
 #%end
 
 #%option G_OPT_F_OUTPUT
-#% key: save_training
-#% label: Save training data to csv file
+#% key: save_model
+#% label: Save model from file
 #% required: no
 #% guisection: Optional
 #%end
 
 #%option G_OPT_F_INPUT
-#% key: load_training
-#% label: Load training data from file
+#% key: load_model
+#% label: Load model from file
 #% required: no
 #% guisection: Optional
 #%end
 
 #%rules
-#% exclusive: roi,loadfile
-#% exclusive: roi,load_training
+#% exclusive: roi,load_model
 #% exclusive: save_training,load_training
 #%end
 
-
-
 # import standard modules
-import atexit, random, string, re, os
+import atexit
+import os
 import numpy as np
 from subprocess import PIPE
 
 import grass.script as grass
-from grass.pygrass.raster import RasterRow
-from grass.pygrass.gis.region import Region
-from grass.pygrass.raster.buffer import Buffer
 from grass.pygrass.modules.shortcuts import imagery as im
 
+from grass.pygrass.utils import set_path
+set_path('r.randomforest')
+from ml_utils import *
+from ml_classifiers import model_classifiers
 
+try:
+    import pandas as pd
+except ImportError:
+    grass.fatal("Pandas not installed")
 
-def cleanup():
+try:
+    import sklearn
+    from sklearn.externals import joblib
+    from sklearn import metrics
+    from sklearn import preprocessing
 
-    grass.run_command("g.remove", name='clfmasktmp', flags="f",
-                      type="raster", quiet=True)
+except ImportError:
+    grass.fatal("Scikit learn is not installed")
 
+# check for sklearn version
+from distutils.version import LooseVersion
+if LooseVersion(sklearn.__version__) < LooseVersion('0.18'):
+    grass.fatal("Scikit learn 0.18 or newer is required")
 
 
-def sample_predictors_byrow(response, predictors):
+def cleanup():
 
-    """
-    Samples a list of GRASS rasters using a labelled raster
-    Row-by-row sampling
-    
-    Parameters
-    ----------
-    response: String; GRASS raster with labelled pixels
-    predictors: List of GRASS rasters containing explanatory variables
+    grass.run_command("g.remove", name='tmp_clfmask',
+                      flags="f", type="raster", quiet=True)
+    grass.run_command("g.remove", name='tmp_roi_clumped',
+                      flags="f", type="raster", quiet=True)
 
-    Returns
-    -------
-    
-    training_data: Numpy array of extracted raster values
-    training_labels: Numpy array of labels
-    
-    """
 
-    # create response rasterrow and open
-    roi_raster = RasterRow(response)
-    roi_raster.open('r')
-    
-    # create a list of rasterrow objects
-    # Then each GRASS rasterrow can be referred to by rastack[band][row]:
-    n_features = len(predictors)
-    rasstack = [0] * n_features
-    for i in range(n_features):
-        rasstack[i] = RasterRow(predictors[i])
-        if rasstack[i].exist() == True:
-            rasstack[i].open('r')
-        else:
-            grass.fatal("GRASS raster " + maplist[i] +
-                        " does not exist.... exiting")
-        
-    # use grass.pygrass.gis.region to get information about the current region
-    current = Region()
+def main():
 
-    # determine cell storage type of training roi raster
-    roi_type = grass.read_command("r.info", map=response, flags='g')
-    roi_list = str(roi_type).split(os.linesep)
-    dtype = roi_list[9].split('=')[1]
-
-    # Count number of labelled pixels
-    roi_stats = str(grass.read_command("r.univar", flags=("g"), map=response))
-    roi_stats = roi_stats.split(os.linesep)[0]
-    nlabel_pixels = int(str(roi_stats).split('=')[1])
-
-    # Create a zero numpy array with the dimensions of the number of columns
-    # and the number of bands plus an additional band to attach the labels
-    tindex = 0
-    training_labels = []
-    training_data = np.zeros((nlabel_pixels, n_features+1))
-    training_data[:] = np.NAN
-
-    # Loop through each row of the raster
-    for row in range(current.rows):
-        # get the pixels from that row in the ROI raster
-        roi_row_np = roi_raster[row]
-
-        # check if any of those pixels are labelled (not equal to nodata)
-        # can use even if roi is FCELL because nodata will be nan and this is
-
-        # not returned anyway
-        is_train = np.nonzero(roi_row_np > -2147483648)
-        training_labels = np.append(training_labels, roi_row_np[is_train])
-        nlabels_in_row = np.array(is_train).shape[1]
-
-        # if there are any labelled pixels
-        # loop through each imagery band for that row and put the data into
-
-        # the img_row_band_np array
-        if np.isnan(roi_row_np).all() != True:
-
-            for band in range(n_features):
-                imagerow_np = rasstack[band][row]
-
-                # attach the label values onto the last column
-                training_data[tindex : tindex+nlabels_in_row, band] =\
-                    imagerow_np[is_train]
-
-            tindex = tindex + nlabels_in_row
-
-    # attach training label values onto last dimension of numpy array
-    training_data[0:nlabel_pixels, n_features] = training_labels
-
-    # convert any CELL maps no data vals to NaN in the training data
-    for i in range(n_features):
-        training_data[training_data[:, i] == -2147483648] = np.nan
-
-    # Remove nan rows from training data
-        training_data = training_data[~np.isnan(training_data).any(axis=1)]
-
-    # Split the numpy array into training_labels and training_data arrays
-    training_labels = training_data[:, n_features]
-    training_data = training_data[:, 0:n_features]
-
-    # close maps
-    roi_raster.close()
-    
-    for i in range(n_features):
-        rasstack[i].close()
-    
-    return(training_data, training_labels)
-
-
-
-def sample_predictors(response, predictors):
-    
     """
-    Samples a list of GRASS rasters using a labelled raster
-    Per raster sampling
-    
-    Parameters
-    ----------
-    response: String; GRASS raster with labelled pixels
-    predictors: List of GRASS rasters containing explanatory variables
-
-    Returns
-    -------
-    
-    training_data: Numpy array of extracted raster values
-    training_labels: Numpy array of labels
-    
-    """
-    
-    # open response raster as rasterrow and read as np array
-    if RasterRow(response).exist() == True:
-        roi_gr = RasterRow(response)
-        roi_gr.open('r')
-        response_np = np.array(roi_gr)
-    else:
-        grass.fatal("GRASS response raster does not exist.... exiting")
-
-    # check to see if all predictors exist
-    n_features = len(predictors)
-    
-    for i in range(n_features):
-        if RasterRow(predictors[i]).exist() != True:
-            grass.fatal("GRASS raster " + predictors[i]
-                        + " does not exist.... exiting")
-
-    # check if any of those pixels are labelled (not equal to nodata)
-    # can use even if roi is FCELL because nodata will be nan and this is not
-    # returned anyway
-    is_train = np.nonzero(response_np > -2147483648)
-    training_labels = response_np[is_train]
-    n_labels = np.array(is_train).shape[1]
-
-    # Create a zero numpy array of len training labels and n_features+1 for y
-    training_data = np.zeros((n_labels, n_features+1))
-
-    # Loop through each raster and sample pixel values at training indexes
-    for f in range(n_features):
-        predictor_gr = RasterRow(predictors[f])
-        predictor_gr.open('r')
-        feature_np = np.array(predictor_gr)
-        training_data[0:n_labels, f] = feature_np[is_train]
-        predictor_gr.close()
-    
-    # attach training labels to last column
-    training_data[0:n_labels, n_features] = training_labels
-
-    # convert any CELL maps no datavals to NaN in the training data
-    for i in range(n_features):
-        training_data[training_data[:, i] == -2147483648] = np.nan
-
-    # Remove nan rows from training data
-    training_data = training_data[~np.isnan(training_data).any(axis=1)]
-    roi_gr.close()
-    
-    # return X and y data
-    return(training_data[:, 0:n_features], training_data[:, n_features])
-
-
-
-def prediction(clf, predictors, class_probabilities,
-               rowincr, output, mode):
-    
-    """
-    Prediction on list of GRASS rasters using a fitted scikit learn model
-    
-    Parameters
-    ----------
-    clf: Scikit learn estimator object
-    predictors: List of paths to GDAL rasters that represent the predictor variables
-    output: Name of GRASS raster to output classification results
-    rowincr: Integer of raster rows to process at one time
-    class_probabilties: Boolean of whether probabilities of each class should also be predicted
-    mode: String, classification or regression mode
-    labels: Numpy array of the labels used for the classification
-    
-    """
-    
-    # create a list of rasterrow objects for predictors
-    n_features = len(predictors)
-    rasstack = [0] * n_features
-    for i in range(n_features):
-        rasstack[i] = RasterRow(predictors[i])
-        if rasstack[i].exist() == True:
-            rasstack[i].open('r')
-        else:
-            grass.fatal("GRASS raster " + maplist[i]
-                        + " does not exist.... exiting")
-    
-    # use grass.pygrass.gis.region to get information about the current region
-    current = Region()
-    
-    # create a imagery mask
-    # the input rasters might have different dimensions and non-value pixels.
-    # r.series used to automatically create a mask by propagating the nulls
-    clfmask = 'clfmasktmp'
-    grass.run_command("r.series", output=clfmask,
-                      input=predictors, method='count', flags='n')
-
-    mask_raster = RasterRow(clfmask)
-    mask_raster.open('r')
-    
-    # create and open RasterRow objects for classification
-    classification = RasterRow(output)
-    if mode == 'classification':
-        ftype = 'CELL'
-        nodata = -2147483648
-    else:
-        ftype = 'FCELL'
-        nodata = np.nan
-    classification.open('w', ftype, overwrite=True)
-    
-    # create and open RasterRow objects for  probabilities if enabled
-    if class_probabilities == True:
-        prob_out_raster = [0] * nclasses
-        prob = [0] * nclasses
-        for iclass in range(nclasses):
-            prob_out_raster[iclass] = output + \
-                '_classPr' + str(int(class_list[iclass]))
-            prob[iclass] = RasterRow(prob_out_raster[iclass])
-            prob[iclass].open('w', 'FCELL', overwrite=True)
-
-    """
-    Prediction using row blocks
-    """
-            
-    for rowblock in range(0, current.rows, rowincr):
-        # check that the row increment does not exceed the number of rows
-        if rowblock+rowincr > current.rows:
-            rowincr = current.rows - rowblock
-        img_np_row = np.zeros((rowincr, current.cols, n_features))
-        mask_np_row = np.zeros((rowincr, current.cols))
-
-        # loop through each row, and each band
-
-        # and add these values to the 2D array img_np_row
-        for row in range(rowblock, rowblock+rowincr, 1):
-            mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
-            for band in range(n_features):
-                img_np_row[row-rowblock, :, band] = \
-                    np.array(rasstack[band][row])
-
-        mask_np_row[mask_np_row == -2147483648] = np.nan
-        nanmask = np.isnan(mask_np_row) # True in the mask means invalid data
-
-        # reshape each row-band matrix into a list
-        nsamples = rowincr * current.cols
-        flat_pixels = img_np_row.reshape((nsamples, n_features))
-
-        # remove NaN values and perform the prediction
-        flat_pixels = np.nan_to_num(flat_pixels)
-        result = clf.predict(flat_pixels)
-        result = result.reshape((rowincr, current.cols))
-
-        # replace NaN values so that the prediction does not have a border
-        result = np.ma.masked_array(result, mask=nanmask, fill_value=np.nan)
-
-        # return a copy of result, with masked values filled with a value
-        result = result.filled([nodata])
-
-        # for each row we can perform computation, and write the result into
-        for row in range(rowincr):
-            newrow = Buffer((result.shape[1],), mtype=ftype)
-            newrow[:] = result[row, :]
-            classification.put_row(newrow)
-
-        # same for probabilities
-        if class_probabilities == True and mode == 'classification':
-            result_proba = clf.predict_proba(flat_pixels)
-
-            for iclass in range(result_proba.shape[1]):
-                result_proba_class = result_proba[:, iclass]
-                result_proba_class = \
-                    result_proba_class.reshape((rowincr, current.cols))
-                    
-                result_proba_class = \
-                    np.ma.masked_array(result_proba_class,
-                                       mask=nanmask, fill_value=np.nan)
-                result_proba_class = result_proba_class.filled([np.nan])
-
-                for row in range(rowincr):
-                    newrow = Buffer((result_proba_class.shape[1],),
-                                     mtype='FCELL')
-                    newrow[:] = result_proba_class[row, :]
-                    prob[iclass].put_row(newrow)
-
-    classification.close()
-    mask_raster.close()
-
-    if class_probabilities == True and mode == 'classification':
-        for iclass in range(nclasses): prob[iclass].close()
-
-
-
-def shuffle_data(X, y, rstate):
-
-    """
-    Uses scikit learn to shuffle data
-    
-    Parameters
-    ----------
-    X: Numpy array containing predictor values
-    y: Numpy array containing labels
-    rstate: Seed for random generator
-    
-    Returns
-    -------
-    X: Numpy array containing predictor values
-    y: Numpy array containing labels
-
-    """
-
-    from sklearn.utils import shuffle
-
-    # combine XY data into a single numpy array
-    XY = np.empty((X.shape[0], X.shape[1]+1))
-    XY[:,0] = y
-    XY[:,1:] = X
-    
-    XY = shuffle(XY, random_state=rstate)
-
-    # split XY into train_xs and train_y
-    X = XY[:,1:]
-    y = XY[:,0]
-    
-    return(X, y)
-
-
-
-def cross_val_classification(clf, X, y, cv, rstate):
-    
-    """
-    Stratified Kfold cross-validation
-    Generates several scoring_metrics without the need to repeatedly use cross_val_score
-    Also produces by-class scores
-    
-    Parameters
-    ----------
-    clf: Scikit learn estimator object
-    X: Numpy array containing predictor values
-    y: Numpy array containing labels
-    cv: Integer of cross-validation folds
-    rstate: Seed to pass to the random number generator
-    
-    Returns
-    -------
-    cmstats: Dictionary of global accuracy measures per fold
-    byclass_metrics: Dictionary of by-class accuracy measures per fold
-
-    """
-    
-    from sklearn import cross_validation, metrics
-    
-    class_list = np.unique(y)
-    nclasses = len(np.unique(y))
-    
-    # Store performance measures per class
-    byclass_metrics = {
-        'precision': np.zeros((nclasses, cv)),
-        'recall': np.zeros((nclasses, cv)),
-        'f1': np.zeros((nclasses, cv))
-    }
-    
-    # generate Kfold indices
-    k_fold = cross_validation.StratifiedKFold(y, n_folds=cv,
-                                              shuffle=False,
-                                              random_state=rstate)
-
-    # dictionary of lists to store metrics
-    cmstats = {
-        'accuracy': [],
-        'precision': [],
-        'recall': [],
-        'f1': [],
-        'kappa': [],
-        'auc': []
-    }
-
-    # loop through each fold
-    fold=0
-    for train_indices, test_indices in k_fold:
-
-        X_train, X_test =  X[train_indices], X[test_indices]
-        y_train, y_test = y[train_indices], y[test_indices]
-
-        # fit the model on the training data and predict the test data
-        fit = clf.fit(X_train, y_train)
-        y_pred = fit.predict(X_test)
-
-        # calculate metrics
-        cmstats['accuracy'] = np.append(cmstats['accuracy'],
-                                        metrics.accuracy_score(y_test, y_pred))
-
-        cmstats['precision'] = np.append(cmstats['precision'],
-                                         metrics.precision_score(y_test,
-                                                                 y_pred,
-                                                                 average='weighted'))
-
-
-        cmstats['recall'] = np.append(cmstats['recall'],
-                                     metrics.recall_score(y_test, y_pred,
-                                                          average='weighted'))
-
-        cmstats['f1'] = np.append(cmstats['f1'],
-                                  metrics.f1_score(y_test, y_pred,
-                                                   average='weighted'))
-
-        cmstats['kappa'] = np.append(cmstats['kappa'],
-                                     metrics.cohen_kappa_score(y_test, y_pred))
-
-        # test for if binary and classes equal 0,1
-        if len(np.unique(y)) == 2 and all ([0,1] == np.unique(y)):
-            
-            stat_method = "binary"
-            
-            y_pred_proba = fit.predict_proba(X_test)[:,1]
-            cmstats['auc'] = np.append(cmstats['auc'],
-                                       metrics.roc_auc_score(y_test,
-                                                             y_pred_proba))
-        else:
-            stat_method = "micro"
-        
-        # Get performance measures by class     
-        for cindex in range(nclasses):
-            byclass_metrics['recall'][cindex, fold] = \
-                metrics.recall_score(y_test, y_pred, labels =
-                                    [class_list[cindex]],
-                                    pos_label=class_list[cindex], average = stat_method)
-
-            byclass_metrics['precision'][cindex, fold] = \
-                metrics.precision_score(y_test, y_pred, labels =
-                                        [class_list[cindex]],
-                                        pos_label=class_list[cindex], average = stat_method)
-
-            byclass_metrics['f1'][cindex, fold] = \
-                metrics.f1_score(y_test, y_pred, labels =
-                                [class_list[cindex]],
-                                pos_label=class_list[cindex], average = stat_method)
-        fold+=1
-        
-    return(cmstats, byclass_metrics)
-
-def save_training_data(X, y, file):
-
-    """
-    Saves any extracted training data to a csv file
-    
-    Parameters
-    ----------
-    X: Numpy array containing predictor values
-    y: Numpy array containing labels
-    file: Path to a csv file to save data to
-
-    """
-
-    training_data = np.zeros((y.shape[0], X.shape[1]+1))
-    training_data[:, 0:X.shape[1]] = X
-    training_data[:, X.shape[1]] = y
-    np.savetxt(file, training_data, delimiter = ',')
-
-
-
-def load_training_data(file):
-    
-    """
-    Loads training data and labels from a csv file
-    
-    Parameters
-    ----------
-    file: Path to a csv file to save data to
-    
-    Returns
-    -------
-    X: Numpy array containing predictor values
-    y: Numpy array containing labels
-
-    """
-    
-    training_data = np.loadtxt(file, delimiter = ',')
-    n_features = training_data.shape[1]
-    X = training_data[:, 0:n_features-1]
-    y = training_data[:, n_features-1]
-    
-    return(X, y)
-
-def main():
-    
-    """
     GRASS options and flags
     -----------------------
     """
-    
+
     # General options and flags
     igroup = options['igroup']
     roi = options['roi']
     output = options['output']
     model = options['model']
+    norm_data = flags['n']
     cv = int(options['cv'])
+    cvtype = options['cvtype']
     modelonly = flags['m']
-    class_probabilities = flags['p']
+    probability = flags['p']
     rowincr = int(options['lines'])
-    randst = int(options['randst'])
-    model_save = options['savefile']
-    model_load = options['loadfile']
-    save_training = options['save_training']
+    random_state = int(options['random_state'])
+    model_save = options['save_model']
+    model_load = options['load_model']
     load_training = options['load_training']
+    save_training = options['save_training']
     importances = flags['f']
-    weighting = flags['b']
-    lowmem = flags['l']
-    ncores = int(options['ncores'])
+    tuning = flags['h']
+    shell = flags['g']
+    ratio = float(options['ratio'])
+    errors_file = options['errors_file']
+    fimp_file = options['fimp_file']
 
-    # logistic regression
-    c_lr = float(options['c_lr'])
-    fi = flags['i']
-    
-    # decision trees
-    splitter_dt = options['splitter_dt']
-    m_features_dt = int(options['m_features_dt'])
-    min_samples_split_dt = int(options['min_samples_split_dt'])
-    min_samples_leaf_dt = int(options['min_samples_leaf_dt'])
-    min_weight_fraction_leaf_dt = float(options['min_weight_fraction_leaf_dt'])
-    
-    # random forests
-    ntrees_rf = int(options['ntrees_rf'])
-    m_features_rf = int(options['m_features_rf'])
-    minsplit_rf = int(options['minsplit_rf'])
-    
-    # gradient tree boosting
-    learning_rate_gtb = float(options['learning_rate_gtb'])
-    n_estimators_gtb = int(options['n_estimators_gtb'])
-    max_depth_gtb = int(options['max_depth_gtb'])
-    min_samples_split_gtb = int(options['min_samples_split_gtb'])
-    min_samples_leaf_gtb = int(options['min_samples_leaf_gtb'])
-    min_weight_fraction_leaf_gtb = float(options['min_weight_fraction_leaf_gtb'])
-    subsample_gtb = float(options['subsample_gtb'])
-    max_features_gtb = int(options['max_features_gtb'])
-    
-    # classification or regression
-    if model == 'LogisticRegression' \
-    or model == 'DecisionTreeClassifier' \
-    or model == 'RandomForestClassifier' \
-    or model == 'GradientBoostingClassifier' \
-    or model == 'GaussianNB' \
-    or model == 'LinearDiscriminantAnalysis' \
-    or model == 'QuadraticDiscriminantAnalysis':
-        mode = 'classification'
+    if flags['b'] is True:
+        class_weight = 'balanced'
     else:
-        mode = 'regression'
+        class_weight = None
 
+    # classifier options
+    C = float(options['c'])
+    min_samples_split = int(options['min_samples_split'])
+    min_samples_leaf = int(options['min_samples_leaf'])
+    n_estimators = int(options['n_estimators'])
+    learning_rate = float(options['learning_rate'])
+    subsample = float(options['subsample'])
+    max_depth = int(options['max_depth'])
+    max_features = int(options['max_features'])
 
+    if max_features == -1:
+        max_features = 'auto'
+    if max_depth == -1:
+        max_depth = None
+
     """
-    Error checking for valid input parameters
-    -----------------------------------------
-    """
-
-    # decision trees
-    if model == 'DecisionTreeClassifier' or model == 'DecisionTreeRegressor':
-        if m_features_dt == -1:
-            m_features_dt = str('auto')
-        if m_features_dt == 0:
-            grass.fatal("m_features_dt must be greater than zero, or -1 which uses the sqrt(nfeatures)...exiting")
-        if min_samples_split_dt < 1:
-            grass.fatal("min_samples_split_dt must be >=1.....exiting")
-        if min_samples_leaf_dt < 1:
-            grass.fatal("min_samples_leaf_dt must be >=1.....exiting")
-
-    # random forests
-    if model == 'RandomForestClassifier' or model == 'RandomForestRegressor':
-        if m_features_rf == -1:
-            m_features_rf = str('auto')
-        if m_features_rf == 0:
-            grass.fatal("mfeatures must be greater than zero, or -1 which uses the sqrt(nfeatures)...exiting")
-        if minsplit_rf < 1:
-            grass.fatal("minsplit must be greater than zero.....exiting")
-        if ntrees_rf < 1:
-            grass.fatal("ntrees must be greater than zero.....exiting")
-
-    # gradient tree boosting
-    if model == 'GradientBoostingClassifier' or model == 'GradientBoostingRegressor':
-        if n_estimators_gtb < 1:
-            grass.fatal("n_estimators_gtb must be greater than zero...exiting")
-        if max_depth_gtb < 1:
-            grass.fatal("max_depth_gtb must be greater than zero...exiting")
-        if min_samples_split_gtb < 1:
-            grass.fatal("min_samples_split_gtb must be greater than zero...exiting")
-        if min_samples_leaf_gtb < 1:
-            grass.fatal("min_samples_leaf_gtb must be greater than zero...exiting")
-        if max_features_gtb == -1:
-            max_features_gtb = str('auto')
-        if max_features_gtb == 0:
-            grass.fatal("max_features_gtb must be greater than zero, or -1 which uses the sqrt(nfeatures)...exiting")
-
-    # general options
-    if rowincr <= 0:
-        grass.fatal("rowincr must be greater than zero....exiting")
-    if model_save != '' and model_load != '':
-        grass.fatal("Cannot save and load a model at the same time.....exiting")
-    if model_load == '' and roi == '':
-        grass.fatal("Require labelled pixels regions of interest.....exiting")
-    if weighting == True:
-        weighting = 'balanced'
-    else:
-        weighting = None
-
-
-    """
     Obtain information about GRASS rasters to be classified
     -------------------------------------------------------
     """
-    
+
+    Id = None
+
     # fetch individual raster names from group
     groupmaps = im.group(group=igroup, flags="g",
-                         quiet = True, stdout_=PIPE).outputs.stdout
-                         
+                         quiet=True, stdout_=PIPE).outputs.stdout
+
     maplist = groupmaps.split(os.linesep)
     maplist = maplist[0:len(maplist)-1]
     n_features = len(maplist)
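
[Editor's note: a hedged illustration of the parsing above, assuming
`i.group -g` prints one raster name per line; the map names are invented.
Splitting on os.linesep leaves a trailing empty string, which the slice
drops.]

    import os

    groupmaps = 'landclass_1' + os.linesep + 'landclass_2' + os.linesep
    maplist = groupmaps.split(os.linesep)
    maplist = maplist[0:len(maplist)-1]   # ['landclass_1', 'landclass_2']
    n_features = len(maplist)             # 2
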
 
-    # use grass.pygrass.gis.region to get information about the current region
-    current = Region()
+    # Error checking for m_features settings
+    if max_features > n_features:
+        max_features = n_features
 
-    # lazy import of sklearn
-    try:
-        from sklearn.externals import joblib
-        import warnings
-        warnings.filterwarnings("ignore")
-    except:
-        grass.fatal("Scikit-learn python module is not installed...exiting")
-
-
     """
     Sample training data using training ROI
     ---------------------------------------
     """
-    
-    # load the model or training data
-    if model_load != '':
-        clf = joblib.load(model_load)
-    else:
-        if load_training != '':
-            X, y = load_training_data(load_training)
-        else:
-            # query predictor rasters with training features
-            if lowmem != True:
-                X, y = sample_predictors(roi, maplist)
-            else:
-                X, y = sample_predictors_byrow(roi, maplist)
-            
-            # shuffle the training data
-            X, y = shuffle_data(X, y, rstate=randst)
-            
-            # use GRASS option to save the training data
-            if save_training != '': save_training_data(X, y, save_training)
 
-        # determine the number of class labels using np.unique
-        nclasses = len(np.unique(y))
-        class_list = np.unique(y)
-    
-    # Error checking for m_features settings
-    if m_features_dt > n_features: m_features_dt = n_features
-    if m_features_rf > n_features: m_features_rf = n_features
-    if max_features_gtb > n_features: max_features_gtb = n_features
+    # load or sample training data
+    X, y, Id, clf = sample_training_data(roi, maplist, cv, cvtype, model_load,
+                                         model_save, load_training,
+                                         save_training, random_state)
 
+    # determine the number of class labels using np.unique
+    labels = np.unique(y)
 
     """
+    Data preprocessing
+    ------------------
+    """
+    if norm_data is True:
+        scaler = preprocessing.StandardScaler().fit(X)
+        X = scaler.transform(X)
+    else:
+        scaler = None
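
[Editor's note: a short sketch of what this standardization implies
downstream, using scikit-learn's StandardScaler; the data values are
illustrative. The scaler is fitted once on the training samples, and the
same fitted object must transform the raster rows at prediction time so
both share one mean and variance.]

    import numpy as np
    from sklearn import preprocessing

    X_train = np.array([[1.0, 200.0], [2.0, 400.0], [3.0, 600.0]])
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_std = scaler.transform(X_train)          # per-column zero mean, unit variance
    X_new = scaler.transform([[2.5, 500.0]])   # reuse the same fit at prediction
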
+
+    """
     Train the classifier
     --------------------
     """
 
     # define classifier unless model is to be loaded from file
     if model_load == '':
-        
-        # classifiers
-        from sklearn.linear_model import LogisticRegression
-        from sklearn.tree import DecisionTreeClassifier
-        from sklearn.ensemble import RandomForestClassifier
-        from sklearn.ensemble import GradientBoostingClassifier
-        from sklearn.naive_bayes import GaussianNB
-        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
-        from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
-        
-        from sklearn.neighbors import KNeighborsRegressor
-        from sklearn.tree import DecisionTreeRegressor
-        from sklearn.ensemble import RandomForestRegressor
-        from sklearn.ensemble import GradientBoostingRegressor
-        
-        classifiers = {
-            'LogisticRegression': LogisticRegression(C=c_lr, fit_intercept=fi, n_jobs=ncores, class_weight=weighting),
-            'DecisionTreeClassifier': DecisionTreeClassifier(splitter=splitter_dt,
-                                                             max_features=m_features_dt,
-                                                             min_samples_split=min_samples_split_dt,
-                                                             min_samples_leaf=min_samples_leaf_dt,
-                                                             min_weight_fraction_leaf=min_weight_fraction_leaf_dt,
-                                                             random_state=randst,
-                                                             class_weight=weighting),
-            'DecisionTreeRegressor': DecisionTreeRegressor(splitter=splitter_dt,
-                                                           max_features=m_features_dt,
-                                                           min_samples_split=min_samples_split_dt,
-                                                           min_samples_leaf=min_samples_leaf_dt,
-                                                           min_weight_fraction_leaf=min_weight_fraction_leaf_dt,
-                                                           random_state=randst),
-            'RandomForestClassifier': RandomForestClassifier(n_estimators=ntrees_rf,
-                                                             oob_score=True,
-                                                             max_features=m_features_rf,
-                                                             min_samples_split=minsplit_rf,
-                                                             random_state=randst,
-                                                             n_jobs=ncores,
-                                                             class_weight=weighting),
-            'RandomForestRegressor': RandomForestRegressor(n_jobs=ncores,
-                                                           n_estimators=ntrees_rf,
-                                                           oob_score=False,
-                                                           max_features=m_features_rf,
-                                                           min_samples_split=minsplit_rf,
-                                                           random_state=randst),
-            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=learning_rate_gtb,
-                                                                     n_estimators=n_estimators_gtb,
-                                                                     max_depth=max_depth_gtb,
-                                                                     min_samples_split=min_samples_split_gtb,
-                                                                     min_samples_leaf=min_samples_leaf_gtb,
-                                                                     min_weight_fraction_leaf=min_weight_fraction_leaf_gtb,
-                                                                     subsample=subsample_gtb,
-                                                                     max_features=max_features_gtb,
-                                                                     random_state=randst),
-            'GradientBoostingRegressor': GradientBoostingRegressor(learning_rate=learning_rate_gtb,
-                                                                   n_estimators=n_estimators_gtb,
-                                                                   max_depth=max_depth_gtb,
-                                                                   min_samples_split=min_samples_split_gtb,
-                                                                   min_samples_leaf=min_samples_leaf_gtb,
-                                                                   min_weight_fraction_leaf=min_weight_fraction_leaf_gtb,
-                                                                   subsample=subsample_gtb,
-                                                                   max_features=max_features_gtb,
-                                                                   random_state=randst),
-            'GaussianNB': GaussianNB(),
-            'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
-            'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
-        }
-        
-        # define classifier
-        clf = classifiers[model]
-        
-        # train classifier
-        clf.fit(X, y)
-        grass.message(_("Model built with: " + model))
-        
-        
+
+        grass.message("Model=" + model)
+        clf, param_grid, mode = \
+            model_classifiers(model, random_state,
+                              class_weight, C, max_depth,
+                              max_features, min_samples_split,
+                              min_samples_leaf, n_estimators,
+                              subsample, learning_rate)
+
+        # data splitting for automatic parameter tuning
+        if tuning is True:
+            X, X_devel, y, y_devel, Id, Id_devel, clf = \
+                tune_split(X, y, Id, clf, param_grid, ratio, random_state)
+
+            if shell is False:
+                grass.message('\n')
+                grass.message('Searched parameters:')
+                grass.message(str(clf.param_grid))
+                grass.message('\n')
+                grass.message('Best parameters:')
+                grass.message(str(clf.best_params_))
+
+            clf = clf.best_estimator_
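
[Editor's note: tune_split lives in ml_utils.py and its implementation is
not shown in this hunk. As a hedged sketch only, the step behaves roughly
like holding out a development split and grid-searching on it with the
0.18-era scikit-learn API used elsewhere in this module.]

    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

    X, X_devel, y, y_devel = train_test_split(
        X, y, test_size=ratio, random_state=random_state)
    search = GridSearchCV(clf, param_grid, cv=3)
    search.fit(X_devel, y_devel)
    clf = search.best_estimator_    # keep the tuned estimator
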
+
         """
         Cross Validation
         ----------------
         """
 
-        # output internal performance measures for random forests
-        if model == 'RandomForestClassifier':
-            grass.message(_("\r\n"))
-            grass.message(_('RF OOB prediction  accuracy: \t %0.3f' %
-                            (clf.oob_score_ * 100)))
-        if model == 'RandomForestRegressor':
-            grass.message(_("\r\n"))
-            grass.message(_('Coefficient of determination R^2: \t %0.3f' %
-                         (clf.score(X=training_data, y=training_labels))))
-       
         # If cv > 1 then use cross-validation to generate performance measures
         if cv > 1:
-            from sklearn.cross_validation import cross_val_predict, cross_val_score
-            from sklearn.metrics import classification_report
 
-            grass.message(_('\r\n'))
-            grass.message(_("Cross validation global performance measures......:"))
+            grass.message('\r\n')
+            grass.message(
+                "Cross validation global performance measures:")
 
+            # get metrics from kfold cross validation
+            scores, y_test, y_pred = cross_val_classification(
+                clf, X, y, Id, cv, random_state)
+
             if mode == 'classification':
-                # get metrics from kfold cross validation
-                cmstats, byclass_stats = cross_val_classification(clf, X, y, cv, randst)
-                
-                grass.message(_("Accuracy                  :\t%0.2f\t+/-SD\t%0.2f" \
-                    %(cmstats['accuracy'].mean(), cmstats['accuracy'].std())))
-                grass.message(_("Kappa                     :\t%0.2f\t+/-SD\t%0.2f" \
-                    % (cmstats['kappa'].mean(), cmstats['kappa'].std())))
-                grass.message(_("Precision weighted average:\t%0.2f\t+/-SD\t%0.2f" \
-                    % (cmstats['precision'].mean(), cmstats['precision'].std())))
-                grass.message(_("Recall weighted average   :\t%0.2f\t+/-SD\t%0.2f" \
-                    % (cmstats['recall'].mean(), cmstats['recall'].std())))
-                grass.message(_("f1 weighted average       :\t%0.2f\t+/-SD\t%0.2f" \
-                    % (cmstats['f1'].mean(), cmstats['f1'].std())))
-                
+
+                grass.message(
+                    "Accuracy:\t%0.2f\t+/-SD\t%0.2f" %
+                    (scores['accuracy'].mean(), scores['accuracy'].std()))
+
                 # test whether the task is binary with classes equal to [0, 1]
-                if len(np.unique(y)) == 2 and all ([0,1] == np.unique(y)):
-                    grass.message(_("ROC AUC                   :\t%0.2f\t+/-SD\t%0.2f"\
-                        % (cmstats['auc'].mean(), cmstats['auc'].std())))
-                
-                # calculate scores by class
-                grass.message(_("\r\n"))
-                grass.message(_("Performance measures by class......:"))
-                grass.message(_("Cla\tRecall\t2SD \tPrecision\t2SD\tf1  \tSD"))
+                if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
+                    grass.message(
+                        "ROC AUC :\t%0.2f\t+/-SD\t%0.2f" %
+                        (scores['auc'].mean(), scores['auc'].std()))
 
-                byclass_scores = ['recall', 'precision', 'f1']
-                
-                for i in range(nclasses):
-                    row = str(int(class_list[i])) + '\t'
-                    for method in byclass_scores:
-                        row += ('%.2f' % byclass_stats[method][i, :].mean() ) +\
-                        '\t' + ('%.2f' % byclass_stats[method][i, :].std() ) + '\t'
-                    grass.message(_(row))
-                
+                # classification report
+                grass.message("\n")
+                grass.message("Classification report:")
+                grass.message(metrics.classification_report(y_test, y_pred))
+
+                if errors_file != '':
+                    errors = pd.DataFrame({'accuracy': scores['accuracy']})
+                    errors.to_csv(errors_file, mode='w')
+
             else:
-                r2 = cross_val_score(clf, X, y, cv=cv, scoring='r2', n_jobs=ncores)
-                grass.message(_("R2:\t%0.2f\t+/-\t%0.2f" % (r2.mean(), r2.std() * 2)))
-                
-        # diagnostics
-        if importances == True:
-            if (model == 'RandomForestClassifier' or 
-                model == 'GradientBoostingClassifier' or
-                model == 'RandomForestRegressor' or
-                model == 'GradientBoostingRegressor' or
-                model == 'DecisionTreeClassifier' or
-                model == 'DecisionTreeRegressor'):
-                    
-                    clfimp = clf.feature_importances_
-                    grass.message(_("\r\n"))
-                    grass.message(_("Feature importances"))
-                    grass.message(_("id" + "\t" + "Raster" + "\t" + "Importance"))
-                    
-                    for i in range(len(clfimp)):
-                         grass.message(_(str(i) + "\t" + maplist[i]
-                                         + "\t" + str(round(clfimp[i], 4))))
-        
-        # save the model
+                grass.message("R2:\t%0.2f\t+/-\t%0.2f" %
+                              (scores['r2'].mean(), scores['r2'].std()))
+
+                if errors_file != '':
+                    errors = pd.DataFrame({'r2': scores['r2']})
+                    errors.to_csv(errors_file, mode='w')
+
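
[Editor's note: cross_val_classification is defined in ml_utils.py and,
per the commit log, also honours the Id groups for spatial
cross-validation. A hedged, non-spatial sketch of the per-fold scoring it
reports might look like the following.]

    import numpy as np
    from sklearn.cross_validation import KFold
    from sklearn import metrics

    def simple_cv_scores(clf, X, y, cv, random_state):
        accuracy = np.zeros(cv)
        kf = KFold(len(y), n_folds=cv, shuffle=True, random_state=random_state)
        for fold, (train, test) in enumerate(kf):
            clf.fit(X[train], y[train])
            y_pred = clf.predict(X[test])
            accuracy[fold] = metrics.accuracy_score(y[test], y_pred)
        return {'accuracy': accuracy}
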
+        # train classifier
+        clf.fit(X, y)
+        y_pred = clf.predict(X)
+
+        # check that every class can be predicted;
+        # if not, restrict labels to the predictable classes,
+        # otherwise writing the predicted probabilities will fail
+        if len(np.unique(y_pred)) != len(labels):
+            labels = np.unique(y_pred)
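
[Editor's note: the relabelling matters because, for the classification
models, predict_proba emits one probability column per class actually seen
by the fitted estimator. A brief sketch of the invariant being protected:]

    proba = clf.predict_proba(X)    # shape: (n_samples, n_classes)
    assert proba.shape[1] == len(clf.classes_)
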
+
+        """
+        Feature Importances
+        -------------------
+        """
+
+        if importances is True:
+
+            clfimp = feature_importances(clf, X, y)
+
+            # output to GRASS message
+            grass.message("\r\n")
+            grass.message("Normalized feature importances")
+            grass.message("id" + "\t" + "Raster" + "\t" + "Importance")
+
+            for i in range(len(clfimp)):
+                grass.message(
+                    str(i) + "\t" + maplist[i] +
+                    "\t" + str(round(clfimp[i], 4)))
+
+            if fimp_file != '':
+                fimp_output = pd.DataFrame(
+                    {'grass raster': maplist, 'importance': clfimp})
+                fimp_output.to_csv(
+                    path_or_buf=fimp_file,
+                    header=['grass raster', 'importance'])
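
[Editor's note: feature_importances is another ml_utils.py helper whose
implementation is not shown in this hunk. A hedged sketch of a normalized
variant for estimators that expose feature_importances_ (tree ensembles
do; other estimators would need e.g. a permutation-based fallback):]

    import numpy as np

    def normalized_importances(clf):
        imp = np.asarray(clf.feature_importances_, dtype=float)
        return imp / imp.sum()    # rescale so the scores sum to 1
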
+
+        """
+        Save the fitted model
+        ---------------------
+        """
+
         if model_save != '':
             joblib.dump(clf, model_save + ".pkl")
-        
-        if modelonly == True:
+
+        if modelonly is True:
             grass.fatal("Model built and now exiting")
 
-
     """
     Prediction on the rest of the GRASS rasters in the imagery group
     ----------------------------------------------------------------
     """
-    
-    prediction(clf, maplist, class_probabilities, rowincr, output, mode)
-    
 
+    prediction(clf, labels, maplist, scaler, probability,
+               rowincr, output, mode)
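
[Editor's note: prediction (ml_utils.py) is likewise not shown here. As a
hedged sketch of the rowincr strategy it is handed, each raster is read a
block of rows at a time, the blocks are flattened into an
(n_pixels, n_features) matrix for the estimator, and the result is
reshaped for writing, so peak memory scales with rowincr rather than with
the full region.]

    import numpy as np

    def predict_block(clf, blocks):
        # blocks: one (rowincr, ncols) array per raster in maplist
        flat = np.column_stack([b.flatten() for b in blocks])
        pred = clf.predict(flat)
        return pred.reshape(blocks[0].shape)
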
+
+
 if __name__ == "__main__":
     options, flags = grass.parser()
     atexit.register(cleanup)


