[GRASS-SVN] r70075 - grass-addons/grass7/raster/r.randomforest
svn_grass at osgeo.org
Wed Dec 14 09:56:10 PST 2016
Author: spawley
Date: 2016-12-14 09:56:10 -0800 (Wed, 14 Dec 2016)
New Revision: 70075
Modified:
grass-addons/grass7/raster/r.randomforest/Makefile
grass-addons/grass7/raster/r.randomforest/r.randomforest.py
Log:
'r.randomforest refactoring'
Modified: grass-addons/grass7/raster/r.randomforest/Makefile
===================================================================
--- grass-addons/grass7/raster/r.randomforest/Makefile 2016-12-14 05:30:02 UTC (rev 70074)
+++ grass-addons/grass7/raster/r.randomforest/Makefile 2016-12-14 17:56:10 UTC (rev 70075)
@@ -2,9 +2,6 @@
PGM = r.randomforest
-ETCFILES = ml_classifiers ml_utils
-
include $(MODULE_TOPDIR)/include/Make/Script.make
-include $(MODULE_TOPDIR)/include/Make/Python.make
default: script
Modified: grass-addons/grass7/raster/r.randomforest/r.randomforest.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-12-14 05:30:02 UTC (rev 70074)
+++ grass-addons/grass7/raster/r.randomforest/r.randomforest.py 2016-12-14 17:56:10 UTC (rev 70075)
@@ -247,20 +247,19 @@
#% exclusive: save_training,load_training
#%end
-# import standard modules
import atexit
import os
import numpy as np
-from subprocess import PIPE
-
import grass.script as grass
+import tempfile
+import copy
from grass.pygrass.modules.shortcuts import imagery as im
+from subprocess import PIPE
+from grass.pygrass.raster import RasterRow
+from grass.pygrass.gis.region import Region
+from grass.pygrass.raster.buffer import Buffer
+from grass.pygrass.modules.shortcuts import raster as r
-from grass.pygrass.utils import set_path
-set_path('r.randomforest')
-from ml_utils import *
-from ml_classifiers import model_classifiers
-
try:
import pandas as pd
except:
@@ -271,11 +270,28 @@
from sklearn.externals import joblib
from sklearn import metrics
from sklearn import preprocessing
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.model_selection import GroupKFold
+ from sklearn.model_selection import train_test_split
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.feature_selection import SelectKBest
+ from sklearn.feature_selection import f_classif
+ from sklearn.utils import shuffle
+ from sklearn.cluster import KMeans
+
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.ensemble import GradientBoostingClassifier
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.svm import SVC
except:
grass.fatal("Scikit learn is not installed")
-# check for sklearn version
if tuple(int(i) for i in sklearn.__version__.split('.')[:2]) < (0, 18):
grass.fatal("Scikit learn 0.18 or newer is required")
@@ -288,6 +304,574 @@
flags="f", type="raster", quiet=True)
+def model_classifiers(estimator, random_state, class_weight,
+ C, max_depth, max_features,
+ min_samples_split, min_samples_leaf,
+ n_estimators, subsample, learning_rate):
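+    """
+    Returns a scikit learn estimator and a dict of parameter ranges for
+    hyperparameter tuning, selected by estimator name
+
+    Parameters
+    ----------
+    estimator: String, name of the classifier or regressor
+    random_state: Seed to pass to the random number generator
+    class_weight, C, max_depth, max_features, min_samples_split,
+    min_samples_leaf, n_estimators, subsample, learning_rate:
+        Hyperparameters passed through to the scikit learn estimators
+
+    Returns
+    -------
+    clf: Scikit learn estimator object
+    params: Dict of parameter ranges for grid search
+    mode: String, classification or regression
+    """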
+
+ classifiers = {
+ 'SVC': SVC(C=C, probability=True, random_state=random_state),
+ 'LogisticRegression': LogisticRegression(C=C, class_weight=class_weight,
+ random_state=random_state),
+ 'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=max_depth,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state,
+ class_weight=class_weight),
+ 'DecisionTreeRegressor': DecisionTreeRegressor(max_features=max_features,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ random_state=random_state),
+ 'RandomForestClassifier': RandomForestClassifier(n_estimators=n_estimators,
+ class_weight=class_weight,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ random_state=random_state,
+ n_jobs=-1,
+ oob_score=True),
+ 'RandomForestRegressor': RandomForestRegressor(n_estimators=n_estimators,
+ max_features=max_features,
+ min_samples_split=min_samples_split,
+ random_state=random_state,
+ n_jobs=-1,
+ oob_score=True),
+ 'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ subsample=subsample,
+ max_features=max_features,
+ random_state=random_state),
+ 'GradientBoostingRegressor': GradientBoostingRegressor(learning_rate=learning_rate,
+ n_estimators=n_estimators,
+ max_depth=max_depth,
+ min_samples_split=min_samples_split,
+ min_samples_leaf=min_samples_leaf,
+ subsample=subsample,
+ max_features=max_features,
+ random_state=random_state),
+ 'GaussianNB': GaussianNB(),
+ 'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
+ 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
+ }
+
+ SVCOpts = {'C': [1, 10, 100], 'shrinking': [True, False]}
+ LogisticRegressionOpts = {'C': [1, 10, 100, 1000]}
+ DecisionTreeOpts = {'max_depth': [2, 4, 6, 8, 10, 20],
+ 'max_features': ['sqrt', 'log2', None],
+ 'min_samples_split': [2, 0.01, 0.05, 0.1, 0.25]}
+ RandomForestOpts = {'max_features': ['sqrt', 'log2', None]}
+ GradientBoostingOpts = {'learning_rate': [0.01, 0.02, 0.05, 0.1],
+ 'max_depth': [3, 4, 6, 10],
+ 'max_features': ['sqrt', 'log2', None],
+ 'n_estimators': [50, 100, 150, 250, 500]}
+
+ param_grids = {
+ 'SVC': SVCOpts,
+ 'LogisticRegression': LogisticRegressionOpts,
+ 'DecisionTreeClassifier': DecisionTreeOpts,
+ 'DecisionTreeRegressor': DecisionTreeOpts,
+ 'RandomForestClassifier': RandomForestOpts,
+ 'RandomForestRegressor': RandomForestOpts,
+ 'GradientBoostingClassifier': GradientBoostingOpts,
+ 'GradientBoostingRegressor': GradientBoostingOpts,
+ 'GaussianNB': {},
+ 'LinearDiscriminantAnalysis': {},
+ 'QuadraticDiscriminantAnalysis': {},
+ }
+
+ # define classifier
+ clf = classifiers[estimator]
+ params = param_grids[estimator]
+
+    # classification or regression
+    if estimator in ['LogisticRegression', 'DecisionTreeClassifier',
+                     'RandomForestClassifier', 'GradientBoostingClassifier',
+                     'GaussianNB', 'LinearDiscriminantAnalysis',
+                     'QuadraticDiscriminantAnalysis', 'SVC']:
+        mode = 'classification'
+    else:
+        mode = 'regression'
+
+ return (clf, params, mode)
+
+
+def save_training_data(X, y, groups, file):
+
+ """
+ Saves any extracted training data to a csv file
+
+ Parameters
+ ----------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels
+ file: Path to a csv file to save data to
+
+ """
+
+ # if there are no group labels, create a nan filled array
+ if groups is None:
+ groups = np.empty((y.shape[0]))
+ groups[:] = np.nan
+
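+    # column layout of the saved data: predictor values, then labels, then group ids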
+ training_data = np.zeros((y.shape[0], X.shape[1]+2))
+ training_data[:, 0:X.shape[1]] = X
+ training_data[:, X.shape[1]] = y
+ training_data[:, X.shape[1]+1] = groups
+
+ np.savetxt(file, training_data, delimiter=',')
+
+
+def load_training_data(file):
+
+ """
+ Loads training data and labels from a csv file
+
+ Parameters
+ ----------
+    file: Path to a csv file to load data from
+
+ Returns
+ -------
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ groups: Numpy array of group labels, or None
+
+ """
+
+ training_data = np.loadtxt(file, delimiter=',')
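+    # the last two columns of the saved data hold the labels and group ids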
+ n_features = training_data.shape[1]-1
+
+ # check to see if last column contains group labels or nans
+ groups = training_data[:, -1]
+ training_data = training_data[:, 0:n_features]
+
+    if np.isnan(groups).all():
+ # if all nans then ignore last column
+ groups = None
+
+ # fetch X and y
+ X = training_data[:, 0:n_features-1]
+ y = training_data[:, -1]
+
+ return(X, y, groups)
+
+
+def sample_predictors(response, predictors, shuffle_data, lowmem, random_state):
+
+ """
+ Samples a list of GRASS rasters using a labelled raster
+ Per raster sampling
+
+ Parameters
+ ----------
+ response: String; GRASS raster with labelled pixels
+ predictors: List of GRASS rasters containing explanatory variables
+
+ Returns
+ -------
+
+ training_data: Numpy array of extracted raster values
+ training_labels: Numpy array of labels
+ y_indexes: Row and Columns of label positions
+
+ """
+ current = Region()
+
+ # open response raster as rasterrow and read as np array
+    if RasterRow(response).exist():
+ roi_gr = RasterRow(response)
+ roi_gr.open('r')
+
+ if lowmem is False:
+ response_np = np.array(roi_gr)
+ else:
+ response_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+ response_np[:] = np.array(roi_gr)[:]
+ else:
+ grass.fatal("GRASS response raster does not exist.... exiting")
+
+ # determine number of predictor rasters
+ n_features = len(predictors)
+
+ # check to see if all predictors exist
+    for i in range(n_features):
+        if not RasterRow(predictors[i]).exist():
+            grass.fatal("GRASS raster " + predictors[i] +
+                        " does not exist.... exiting")
+
+ # check if any of those pixels are labelled (not equal to nodata)
+ # can use even if roi is FCELL because nodata will be nan
+ is_train = np.nonzero(response_np > -2147483648)
+ training_labels = response_np[is_train]
+ n_labels = np.array(is_train).shape[1]
+
+ # Create a zero numpy array of len training labels
+ if lowmem is False:
+ training_data = np.zeros((n_labels, n_features))
+ else:
+ training_data = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(n_labels, n_features))
+
+ # Loop through each raster and sample pixel values at training indexes
+ if lowmem is True:
+ feature_np = np.memmap(tempfile.NamedTemporaryFile(),
+ dtype='float32', mode='w+',
+ shape=(current.rows, current.cols))
+
+ for f in range(n_features):
+ predictor_gr = RasterRow(predictors[f])
+ predictor_gr.open('r')
+
+ if lowmem is False:
+ feature_np = np.array(predictor_gr)
+ else:
+ feature_np[:] = np.array(predictor_gr)[:]
+
+ training_data[0:n_labels, f] = feature_np[is_train]
+ predictor_gr.close()
+
+    # convert CELL nodata values (-2147483648) to NaN in the training data
+    for i in range(n_features):
+        training_data[training_data[:, i] == -2147483648, i] = np.nan
+
+ # convert indexes of training pixels from tuple to n*2 np array
+ is_train = np.array(is_train).T
+
+ # Remove nan rows from training data
+ X = training_data[~np.isnan(training_data).any(axis=1)]
+ y = training_labels[~np.isnan(training_data).any(axis=1)]
+ y_indexes = is_train[~np.isnan(training_data).any(axis=1)]
+
+ roi_gr.close()
+
+ # shuffle the training data
+    if shuffle_data:
+ X, y, y_indexes = shuffle(X, y, y_indexes, random_state=random_state)
+
+ return(X, y, y_indexes)
+
+
+def prediction(clf, labels, predictors, scaler, class_probabilities,
+ rowincr, output, mode):
+
+ """
+ Prediction on list of GRASS rasters using a fitted scikit learn model
+
+ Parameters
+ ----------
+ clf: Scikit learn estimator object
+ labels: Numpy array of the labels used for the classification
+ predictors: List of GRASS rasters
+ scaler: Scaler for predictors, or None
+    class_probabilities: Boolean, also predict class probabilities
+    rowincr: Integer of raster rows to process at one time
+    output: Name of GRASS raster to output classification results
+ mode: String, classification or regression mode
+
+ """
+
+ # create a list of rasterrow objects for predictors
+ n_features = len(predictors)
+ nclasses = len(labels)
+ rasstack = [0] * n_features
+
+ for i in range(n_features):
+ rasstack[i] = RasterRow(predictors[i])
+        if rasstack[i].exist():
+ rasstack[i].open('r')
+ else:
+ grass.fatal("GRASS raster " + predictors[i] +
+ " does not exist.... exiting")
+
+ # use grass.pygrass.gis.region to get information about the current region
+ current = Region()
+
+    # create an imagery mask
+    # the input rasters might have different dimensions and null pixels;
+    # r.series is used to automatically create a mask by propagating the nulls
+ grass.run_command("r.series", output='tmp_clfmask',
+ input=predictors, method='count', flags='n',
+ overwrite=True)
+
+ mask_raster = RasterRow('tmp_clfmask')
+ mask_raster.open('r')
+
+ # create and open RasterRow objects for classification
+ classification = RasterRow(output)
+ if mode == 'classification':
+ ftype = 'CELL'
+ nodata = -2147483648
+ else:
+ ftype = 'FCELL'
+ nodata = np.nan
+ classification.open('w', ftype, overwrite=True)
+
+ # create and open RasterRow objects for probabilities if enabled
+    if class_probabilities:
+ prob_out_raster = [0] * nclasses
+ prob = [0] * nclasses
+ for iclass in range(nclasses):
+ prob_out_raster[iclass] = output + \
+ '_classPr' + str(int(labels[iclass]))
+ prob[iclass] = RasterRow(prob_out_raster[iclass])
+ prob[iclass].open('w', 'FCELL', overwrite=True)
+
+ """
+ Prediction using row blocks
+ """
+
+ for rowblock in range(0, current.rows, rowincr):
+
+ # check that the row increment does not exceed the number of rows
+ if rowblock+rowincr > current.rows:
+ rowincr = current.rows - rowblock
+ img_np_row = np.zeros((rowincr, current.cols, n_features))
+ mask_np_row = np.zeros((rowincr, current.cols))
+
+ # loop through each row, and each band
+ # and add these values to the 2D array img_np_row
+ for row in range(rowblock, rowblock+rowincr, 1):
+ mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
+
+ for band in range(n_features):
+ img_np_row[row-rowblock, :, band] = \
+ np.array(rasstack[band][row])
+
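+        # GRASS CELL nulls are read as -2147483648; convert them to NaN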
+ mask_np_row[mask_np_row == -2147483648] = np.nan
+ nanmask = np.isnan(mask_np_row) # True in the mask means invalid data
+
+ # reshape each row-band matrix into a n*m array
+ nsamples = rowincr * current.cols
+ flat_pixels = img_np_row.reshape((nsamples, n_features))
+
+ # remove NaN values
+ flat_pixels = np.nan_to_num(flat_pixels)
+
+ # rescale
+ if scaler is not None:
+ flat_pixels = scaler.transform(flat_pixels)
+
+ # perform prediction
+ result = clf.predict(flat_pixels)
+ result = result.reshape((rowincr, current.cols))
+
+ # replace NaN values so that the prediction does not have a border
+ result = np.ma.masked_array(result, mask=nanmask, fill_value=-99999)
+
+ # return a copy of result, with masked values filled with a value
+ result = result.filled([nodata])
+
+ # for each row we can perform computation, and write the result into
+ for row in range(rowincr):
+ newrow = Buffer((result.shape[1],), mtype=ftype)
+ newrow[:] = result[row, :]
+ classification.put_row(newrow)
+
+ # same for probabilities
+        if class_probabilities and mode == 'classification':
+ result_proba = clf.predict_proba(flat_pixels)
+
+ for iclass in range(result_proba.shape[1]):
+
+ result_proba_class = result_proba[:, iclass]
+ result_proba_class = result_proba_class.reshape(
+ (rowincr, current.cols))
+
+ result_proba_class = np.ma.masked_array(
+ result_proba_class, mask=nanmask, fill_value=np.nan)
+
+ result_proba_class = result_proba_class.filled([np.nan])
+
+ for row in range(rowincr):
+
+ newrow = Buffer((
+ result_proba_class.shape[1],), mtype='FCELL')
+
+ newrow[:] = result_proba_class[row, :]
+ prob[iclass].put_row(newrow)
+
+ classification.close()
+ mask_raster.close()
+
+    if class_probabilities and mode == 'classification':
+ for iclass in range(nclasses):
+ prob[iclass].close()
+
+
+def cross_val_classification(clf, X, y, group_ids, cv, rstate):
+
+ """
+    Stratified or group k-fold cross-validation
+    Generates several scoring metrics per fold
+
+ Parameters
+ ----------
+ clf: Scikit learn estimator object
+ X: Numpy array containing predictor values
+ y: Numpy array containing labels
+ group_ids: Numpy array containing group labels for samples
+ cv: Integer of cross-validation folds
+ rstate: Seed to pass to the random number generator
+
+ Returns
+ -------
+ scores: Dictionary of global accuracy measures per fold
+ y_test_agg: Aggregated test responses
+ y_pred_agg: Aggregated predicted responses
+
+ """
+
+ # dicts of lists to store metrics
+ scores = {
+ 'accuracy': [],
+ 'auc': [],
+ 'r2': []
+ }
+
+ # generate Kfold indices
+ if group_ids is None:
+ k_fold = StratifiedKFold(
+ n_splits=cv, shuffle=False, random_state=rstate).split(X, y)
+ else:
+
+ n_groups = len(np.unique(group_ids))
+
+ if cv > n_groups:
+ grass.message(
+ "Number of cv folds cannot be larger than the number of groups in the zonal raster, using n_groups")
+ cv = n_groups
+ k_fold = GroupKFold(n_splits=cv).split(X, y, groups=group_ids)
+
+ y_test_agg = []
+ y_pred_agg = []
+
+ # loop through each fold
+ for train_indices, test_indices in k_fold:
+
+ X_train, X_test = X[train_indices], X[test_indices]
+ y_train, y_test = y[train_indices], y[test_indices]
+
+ # fit the model on the training data and predict the test data
+ fit = clf.fit(X_train, y_train)
+ y_pred = fit.predict(X_test)
+
+ y_test_agg = np.append(y_test_agg, y_test)
+ y_pred_agg = np.append(y_pred_agg, y_pred)
+
+ # calculate metrics
+ try:
+ scores['accuracy'] = np.append(
+ scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
+ except:
+ pass
+
+ scores['r2'] = np.append(
+ scores['r2'], metrics.r2_score(y_test, y_pred))
+
+        # test whether the response is binary with classes 0,1
+        if np.array_equal(np.unique(y), [0, 1]):
+ try:
+ y_pred_proba = fit.predict_proba(X_test)[:, 1]
+
+ scores['auc'] = np.append(
+ scores['auc'], metrics.roc_auc_score(y_test, y_pred_proba))
+ except:
+ pass
+
+ return(scores, y_test_agg, y_pred_agg)
+
+
+def tune_split(X, y, Id, estimator, metric, params, test_size, random_state):
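+    """
+    Splits the training data into separate training and development sets,
+    then grid searches the parameter ranges on the development set using
+    3-fold cross-validation
+
+    Returns the reduced training data, the development data, and the
+    fitted GridSearchCV object
+    """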
+
+ if Id is None:
+ X, X_devel, y, y_devel = train_test_split(X, y, test_size=test_size,
+ random_state=random_state)
+ Id_devel = None
+ else:
+ X, X_devel, y, y_devel, Id, Id_devel = train_test_split(X, y, Id, test_size=test_size,
+ random_state=random_state, stratify=Id)
+
+ clf = GridSearchCV(estimator=estimator, cv=3, param_grid=params,
+ scoring=metric, n_jobs=-1)
+
+ clf.fit(X_devel, y_devel)
+
+ return (X, X_devel, y, y_devel, Id, Id_devel, clf)
+
+
+def feature_importances(clf, X, y):
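+    """
+    Returns the feature importances of the fitted estimator, falling back
+    to univariate ANOVA F-scores if the estimator does not expose
+    feature_importances_
+    """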
+
+    try:
+        clfimp = clf.feature_importances_
+    except AttributeError:
+        # estimator does not expose feature_importances_;
+        # fall back to univariate ANOVA F-scores
+        sk = SelectKBest(f_classif, k='all')
+        sk_fit = sk.fit(X, y)
+        clfimp = sk_fit.scores_
+
+ return (clfimp)
+
+
+def sample_training_data(roi, maplist, cv, cvtype, model_load,
+ load_training, save_training, lowmem, random_state):
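+    """
+    Loads a saved model or training data, or samples the predictor rasters
+    at the labelled pixels of the roi raster; optionally assigns group ids
+    for spatial cross-validation by clumping the roi raster or k-means
+    clustering of the sample coordinates
+
+    Returns
+    -------
+    X: Numpy array containing predictor values
+    y: Numpy array containing labels
+    Id: Numpy array of group labels, or None
+    clf: Previously fitted estimator if model_load is given, or None
+    """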
+
+ # load the model or training data
+ if model_load != '':
+ clf = joblib.load(model_load)
+ X, y, Id = load_training_data(
+ model_load.replace('.pkl', '.csv'))
+ else:
+ clf = None
+ if load_training != '':
+ X, y, Id = load_training_data(load_training)
+ else:
+ # create clumped roi for spatial cross validation
+ if cv > 1 and cvtype == 'clumped':
+ r.clump(input=roi, output='tmp_roi_clumped', overwrite=True, quiet=True)
+ maplist2 = copy.deepcopy(maplist)
+ maplist2.append('tmp_roi_clumped')
+ X, y, sample_coords = sample_predictors(response=roi,
+ predictors=maplist2,
+ shuffle_data=False,
+ lowmem=lowmem,
+ random_state=random_state)
+ # take Id from last column
+ Id = X[:, -1]
+
+ # remove Id column from predictors
+ X = X[:, 0:X.shape[1]-1]
+ else:
+ # query predictor rasters with training features
+ Id = None
+ X, y, sample_coords = sample_predictors(
+ response=roi, predictors=maplist, shuffle_data=True,
+ lowmem=lowmem, random_state=random_state)
+
+ # perform kmeans clustering on point coordinates
+ if cv > 1 and cvtype == 'kmeans':
+ clusters = KMeans(
+ n_clusters=cv, random_state=random_state, n_jobs=-1)
+ clusters.fit(sample_coords)
+ Id = clusters.labels_
+
+ if save_training != '':
+ save_training_data(X, y, Id, save_training)
+
+ return (X, y, Id, clf)
+
+
def main():
"""