[GRASS-SVN] r70077 - grass-addons/grass7/raster/r.randomforest
svn_grass at osgeo.org
svn_grass at osgeo.org
Wed Dec 14 09:58:47 PST 2016
Author: spawley
Date: 2016-12-14 09:58:47 -0800 (Wed, 14 Dec 2016)
New Revision: 70077
Removed:
grass-addons/grass7/raster/r.randomforest/ml_utils.py
Log:
'r.randomforest refactoring'
Deleted: grass-addons/grass7/raster/r.randomforest/ml_utils.py
===================================================================
--- grass-addons/grass7/raster/r.randomforest/ml_utils.py 2016-12-14 17:58:33 UTC (rev 70076)
+++ grass-addons/grass7/raster/r.randomforest/ml_utils.py 2016-12-14 17:58:47 UTC (rev 70077)
@@ -1,488 +0,0 @@
-import numpy as np
-import grass.script as grass
-import tempfile
-import copy
-from grass.pygrass.raster import RasterRow
-from grass.pygrass.gis.region import Region
-from grass.pygrass.raster.buffer import Buffer
-from sklearn.model_selection import StratifiedKFold
-from sklearn import metrics
-from sklearn.model_selection import GroupKFold
-from sklearn.model_selection import train_test_split
-from sklearn.model_selection import GridSearchCV
-from sklearn.feature_selection import SelectKBest
-from sklearn.feature_selection import f_classif
-from sklearn.utils import shuffle
-
-from sklearn.externals import joblib
-from grass.pygrass.modules.shortcuts import raster as r
-from sklearn.cluster import KMeans
-
-
-def save_training_data(X, y, groups, file):
-
- """
- Saves any extracted training data to a csv file
-
- Parameters
- ----------
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels
- file: Path to a csv file to save data to
-
- """
-
- # if there are no group labels, create a nan filled array
- if groups is None:
- groups = np.empty((y.shape[0]))
- groups[:] = np.nan
-
- training_data = np.zeros((y.shape[0], X.shape[1]+2))
- training_data[:, 0:X.shape[1]] = X
- training_data[:, X.shape[1]] = y
- training_data[:, X.shape[1]+1] = groups
-
- np.savetxt(file, training_data, delimiter=',')
-
-
-def load_training_data(file):
-
- """
- Loads training data and labels from a csv file
-
- Parameters
- ----------
-    file: Path to a csv file to load data from
-
- Returns
- -------
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- groups: Numpy array of group labels, or None
-
- """
-
- training_data = np.loadtxt(file, delimiter=',')
- n_features = training_data.shape[1]-1
-
- # check to see if last column contains group labels or nans
- groups = training_data[:, -1]
- training_data = training_data[:, 0:n_features]
-
- if np.isnan(groups).all() is True:
- # if all nans then ignore last column
- groups = None
-
- # fetch X and y
- X = training_data[:, 0:n_features-1]
- y = training_data[:, -1]
-
- return(X, y, groups)
-
-
-def sample_predictors(response, predictors, shuffle_data, lowmem, random_state):
-
- """
- Samples a list of GRASS rasters using a labelled raster
- Per raster sampling
-
- Parameters
- ----------
- response: String; GRASS raster with labelled pixels
- predictors: List of GRASS rasters containing explanatory variables
-
- Returns
- -------
-
- training_data: Numpy array of extracted raster values
- training_labels: Numpy array of labels
- y_indexes: Row and Columns of label positions
-
- """
- current = Region()
-
- # open response raster as rasterrow and read as np array
- if RasterRow(response).exist() is True:
- roi_gr = RasterRow(response)
- roi_gr.open('r')
-
- if lowmem is False:
- response_np = np.array(roi_gr)
- else:
- response_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
- response_np[:] = np.array(roi_gr)[:]
- else:
- grass.fatal("GRASS response raster does not exist.... exiting")
-
- # determine number of predictor rasters
- n_features = len(predictors)
-
- # check to see if all predictors exist
- for i in range(n_features):
- if RasterRow(predictors[i]).exist() is not True:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # check if any of those pixels are labelled (not equal to nodata)
- # can use even if roi is FCELL because nodata will be nan
- is_train = np.nonzero(response_np > -2147483648)
- training_labels = response_np[is_train]
- n_labels = np.array(is_train).shape[1]
-
- # Create a zero numpy array of len training labels
- if lowmem is False:
- training_data = np.zeros((n_labels, n_features))
- else:
- training_data = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(n_labels, n_features))
-
- # Loop through each raster and sample pixel values at training indexes
- if lowmem is True:
- feature_np = np.memmap(tempfile.NamedTemporaryFile(),
- dtype='float32', mode='w+',
- shape=(current.rows, current.cols))
-
- for f in range(n_features):
- predictor_gr = RasterRow(predictors[f])
- predictor_gr.open('r')
-
- if lowmem is False:
- feature_np = np.array(predictor_gr)
- else:
- feature_np[:] = np.array(predictor_gr)[:]
-
- training_data[0:n_labels, f] = feature_np[is_train]
- predictor_gr.close()
-
- # convert any CELL maps no datavals to NaN in the training data
- for i in range(n_features):
- training_data[training_data[:, i] == -2147483648] = np.nan
-
- # convert indexes of training pixels from tuple to n*2 np array
- is_train = np.array(is_train).T
-
- # Remove nan rows from training data
- X = training_data[~np.isnan(training_data).any(axis=1)]
- y = training_labels[~np.isnan(training_data).any(axis=1)]
- y_indexes = is_train[~np.isnan(training_data).any(axis=1)]
-
- roi_gr.close()
-
- # shuffle the training data
- if shuffle_data is True:
- X, y, y_indexes = shuffle(X, y, y_indexes, random_state=random_state)
-
- return(X, y, y_indexes)
-
-
-def prediction(clf, labels, predictors, scaler, class_probabilities,
- rowincr, output, mode):
-
- """
- Prediction on list of GRASS rasters using a fitted scikit learn model
-
- Parameters
- ----------
- clf: Scikit learn estimator object
- labels: Numpy array of the labels used for the classification
- predictors: List of GRASS rasters
- scaler: Scaler for predictors, or None
- output: Name of GRASS raster to output classification results
- rowincr: Integer of raster rows to process at one time
-    class_probabilities: Predict class probabilities
- mode: String, classification or regression mode
-
- """
-
- # create a list of rasterrow objects for predictors
- n_features = len(predictors)
- nclasses = len(labels)
- rasstack = [0] * n_features
-
- for i in range(n_features):
- rasstack[i] = RasterRow(predictors[i])
- if rasstack[i].exist() is True:
- rasstack[i].open('r')
- else:
- grass.fatal("GRASS raster " + predictors[i] +
- " does not exist.... exiting")
-
- # use grass.pygrass.gis.region to get information about the current region
- current = Region()
-
- # create a imagery mask
- # the input rasters might have different dimensions and non-value pixels.
- # r.series used to automatically create a mask by propagating the nulls
- grass.run_command("r.series", output='tmp_clfmask',
- input=predictors, method='count', flags='n',
- overwrite=True)
-
- mask_raster = RasterRow('tmp_clfmask')
- mask_raster.open('r')
-
- # create and open RasterRow objects for classification
- classification = RasterRow(output)
- if mode == 'classification':
- ftype = 'CELL'
- nodata = -2147483648
- else:
- ftype = 'FCELL'
- nodata = np.nan
- classification.open('w', ftype, overwrite=True)
-
- # create and open RasterRow objects for probabilities if enabled
- if class_probabilities is True:
- prob_out_raster = [0] * nclasses
- prob = [0] * nclasses
- for iclass in range(nclasses):
- prob_out_raster[iclass] = output + \
- '_classPr' + str(int(labels[iclass]))
- prob[iclass] = RasterRow(prob_out_raster[iclass])
- prob[iclass].open('w', 'FCELL', overwrite=True)
-
- """
- Prediction using row blocks
- """
-
- for rowblock in range(0, current.rows, rowincr):
-
- # check that the row increment does not exceed the number of rows
- if rowblock+rowincr > current.rows:
- rowincr = current.rows - rowblock
- img_np_row = np.zeros((rowincr, current.cols, n_features))
- mask_np_row = np.zeros((rowincr, current.cols))
-
- # loop through each row, and each band
- # and add these values to the 2D array img_np_row
- for row in range(rowblock, rowblock+rowincr, 1):
- mask_np_row[row-rowblock, :] = np.array(mask_raster[row])
-
- for band in range(n_features):
- img_np_row[row-rowblock, :, band] = \
- np.array(rasstack[band][row])
-
- mask_np_row[mask_np_row == -2147483648] = np.nan
- nanmask = np.isnan(mask_np_row) # True in the mask means invalid data
-
- # reshape each row-band matrix into a n*m array
- nsamples = rowincr * current.cols
- flat_pixels = img_np_row.reshape((nsamples, n_features))
-
- # remove NaN values
- flat_pixels = np.nan_to_num(flat_pixels)
-
- # rescale
- if scaler is not None:
- flat_pixels = scaler.transform(flat_pixels)
-
- # perform prediction
- result = clf.predict(flat_pixels)
- result = result.reshape((rowincr, current.cols))
-
- # replace NaN values so that the prediction does not have a border
- result = np.ma.masked_array(result, mask=nanmask, fill_value=-99999)
-
- # return a copy of result, with masked values filled with a value
- result = result.filled([nodata])
-
- # for each row we can perform computation, and write the result into
- for row in range(rowincr):
- newrow = Buffer((result.shape[1],), mtype=ftype)
- newrow[:] = result[row, :]
- classification.put_row(newrow)
-
- # same for probabilities
- if class_probabilities is True and mode is 'classification':
- result_proba = clf.predict_proba(flat_pixels)
-
- for iclass in range(result_proba.shape[1]):
-
- result_proba_class = result_proba[:, iclass]
- result_proba_class = result_proba_class.reshape(
- (rowincr, current.cols))
-
- result_proba_class = np.ma.masked_array(
- result_proba_class, mask=nanmask, fill_value=np.nan)
-
- result_proba_class = result_proba_class.filled([np.nan])
-
- for row in range(rowincr):
-
- newrow = Buffer((
- result_proba_class.shape[1],), mtype='FCELL')
-
- newrow[:] = result_proba_class[row, :]
- prob[iclass].put_row(newrow)
-
- classification.close()
- mask_raster.close()
-
- if class_probabilities is True and mode is 'classification':
- for iclass in range(nclasses):
- prob[iclass].close()
-
-
-def cross_val_classification(clf, X, y, group_ids, cv, rstate):
-
- """
- Stratified Kfold cross-validation
- Generates several scoring_metrics
-
- Parameters
- ----------
- clf: Scikit learn estimator object
- X: Numpy array containing predictor values
- y: Numpy array containing labels
- group_ids: Numpy array containing group labels for samples
- cv: Integer of cross-validation folds
- rstate: Seed to pass to the random number generator
-
- Returns
- -------
- scores: Dictionary of global accuracy measures per fold
- y_test_agg: Aggregated test responses
- y_pred_agg: Aggregated predicted responses
-
- """
-
- # dicts of lists to store metrics
- scores = {
- 'accuracy': [],
- 'auc': [],
- 'r2': []
- }
-
- # generate Kfold indices
- if group_ids is None:
- k_fold = StratifiedKFold(
- n_splits=cv, shuffle=False, random_state=rstate).split(X, y)
- else:
-
- n_groups = len(np.unique(group_ids))
-
- if cv > n_groups:
- grass.message(
- "Number of cv folds cannot be larger than the number of groups in the zonal raster, using n_groups")
- cv = n_groups
- k_fold = GroupKFold(n_splits=cv).split(X, y, groups=group_ids)
-
- y_test_agg = []
- y_pred_agg = []
-
- # loop through each fold
- for train_indices, test_indices in k_fold:
-
- X_train, X_test = X[train_indices], X[test_indices]
- y_train, y_test = y[train_indices], y[test_indices]
-
- # fit the model on the training data and predict the test data
- fit = clf.fit(X_train, y_train)
- y_pred = fit.predict(X_test)
-
- y_test_agg = np.append(y_test_agg, y_test)
- y_pred_agg = np.append(y_pred_agg, y_pred)
-
- # calculate metrics
- try:
- scores['accuracy'] = np.append(
- scores['accuracy'], metrics.accuracy_score(y_test, y_pred))
- except:
- pass
-
- scores['r2'] = np.append(
- scores['r2'], metrics.r2_score(y_test, y_pred))
-
- # test for if binary and classes equal 0,1
- if len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
- try:
- y_pred_proba = fit.predict_proba(X_test)[:, 1]
-
- scores['auc'] = np.append(
- scores['auc'], metrics.roc_auc_score(y_test, y_pred_proba))
- except:
- pass
-
- return(scores, y_test_agg, y_pred_agg)
-
-
-def tune_split(X, y, Id, estimator, metric, params, test_size, random_state):
-
- if Id is None:
- X, X_devel, y, y_devel = train_test_split(X, y, test_size=test_size,
- random_state=random_state)
- Id_devel = None
- else:
- X, X_devel, y, y_devel, Id, Id_devel = train_test_split(X, y, Id, test_size=test_size,
- random_state=random_state, stratify=Id)
-
- clf = GridSearchCV(estimator=estimator, cv=3, param_grid=params,
- scoring=metric, n_jobs=-1)
-
- clf.fit(X_devel, y_devel)
-
- return (X, X_devel, y, y_devel, Id, Id_devel, clf)
-
-
-def feature_importances(clf, X, y):
-
- try:
- clfimp = clf.feature_importances_
- except:
- sk = SelectKBest(f_classif, k='all')
- sk_fit = sk.fit(X, y)
- clfimp = sk_fit.scores_
-
- return (clfimp)
-
-
-def sample_training_data(roi, maplist, cv, cvtype, model_load,
- load_training, save_training, lowmem, random_state):
-
- # load the model or training data
- if model_load != '':
- clf = joblib.load(model_load)
- X, y, Id = load_training_data(
- model_load.replace('.pkl', '.csv'))
- else:
- clf = None
- if load_training != '':
- X, y, Id = load_training_data(load_training)
- else:
- # create clumped roi for spatial cross validation
- if cv > 1 and cvtype == 'clumped':
- r.clump(input=roi, output='tmp_roi_clumped', overwrite=True, quiet=True)
- maplist2 = copy.deepcopy(maplist)
- maplist2.append('tmp_roi_clumped')
- X, y, sample_coords = sample_predictors(response=roi,
- predictors=maplist2,
- shuffle_data=False,
- lowmem=lowmem,
- random_state=random_state)
- # take Id from last column
- Id = X[:, -1]
-
- # remove Id column from predictors
- X = X[:, 0:X.shape[1]-1]
- else:
- # query predictor rasters with training features
- Id = None
- X, y, sample_coords = sample_predictors(
- response=roi, predictors=maplist, shuffle_data=True,
- lowmem=lowmem, random_state=random_state)
-
- # perform kmeans clustering on point coordinates
- if cv > 1 and cvtype == 'kmeans':
- clusters = KMeans(
- n_clusters=cv, random_state=random_state, n_jobs=-1)
- clusters.fit(sample_coords)
- Id = clusters.labels_
-
- if save_training != '':
- save_training_data(X, y, Id, save_training)
-
-
- return (X, y, Id, clf)
More information about the grass-commit
mailing list