import copy
import math
import pickle
import keras_tuner as kt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
ConfusionMatrixDisplay,
accuracy_score,
confusion_matrix,
f1_score,
mean_absolute_error,
mean_absolute_percentage_error,
mean_squared_error,
precision_score,
r2_score,
recall_score,
)
from tqdm.auto import tqdm
import tensorflow as tf
import pyMAISE.settings as settings
from pyMAISE.tuner import Tuner
from pyMAISE.utils import _try_clear
from pyMAISE.utils.trial import determine_class_from_probabilities
class PostProcessor:
    """
    Assess the performance of the top-performing models.

    On construction every hyperparameter configuration is refit on the training
    data and its training and testing predictions are stored for later use by
    the metric, retrieval, and plotting methods.

    Parameters
    ----------
    data: tuple of xarray.DataArray
        The training and testing data given as ``(xtrain, xtest, ytrain, ytest)``.
    model_configs: single or list of dict of tuple(pandas.DataFrame, model object)
        The model configurations produced by :class:`pyMAISE.Tuner`.
    new_model_settings: dict of dict of int, float, str, or None, default=None
        Updated model settings given as a dictionary under the model's key.
    yscaler: callable or None, default=None
        An object with an ``inverse_transform`` method such as
        `min-max scaler from sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html>`_
        :cite:`scikit-learn`. This should have been fit using
        :meth:`pyMAISE.preprocessing.scale_data` before hyperparameter
        tuning. If ``None`` then scaling is not undone.
    """

    # Metric columns for which a larger value means a better model. Every other
    # column is treated as "smaller is better" unless ``direction`` says otherwise.
    _MAXIMIZED_SORT_KEYS = frozenset(
        {
            "Train R2",
            "Test R2",
            "Train Accuracy",
            "Test Accuracy",
            "Train Recall",
            "Test Recall",
            "Train Precision",
            "Test Precision",
            "Train F1",
            "Test F1",
        }
    )

    def __init__(
        self,
        data,
        model_configs,
        new_model_settings=None,
        yscaler=None,
    ):
        # Extract data
        self._xtrain, self._xtest, self._ytrain, self._ytest = data

        # Convert to list if only one is given
        if isinstance(model_configs, dict):
            model_configs = [model_configs]

        # One entry per hyperparameter configuration
        model_types = []
        params = []
        model_wrappers = []
        for models in model_configs:
            for model, configs in models.items():
                configurations = configs[0]["params"]

                # Update the wrapper's settings if new ones were supplied
                estimator = configs[1]
                if new_model_settings is not None and model in new_model_settings:
                    if (
                        model in Tuner.supported_classical_models
                        or not settings.values.new_nn_architecture
                    ):
                        estimator = estimator.set_params(**new_model_settings[model])
                    else:
                        estimator.set_params(new_model_settings[model])

                # The same wrapper object is shared by every configuration of
                # this model type; it is re-parameterized before each fit
                model_types.extend([model] * len(configurations))
                params.extend(configurations.tolist())
                model_wrappers.extend([estimator] * len(configurations))

        # Create models DataFrame
        self._models = pd.DataFrame(
            {
                "Model Types": model_types,
                "Parameter Configurations": params,
                "Model Wrappers": model_wrappers,
            }
        )

        # If we are doing Gaussian Processing, check for standard scaling
        if "GP" in model_types:
            self.__check_if_standard_scaled(self._xtrain, "Training input")
            self.__check_if_standard_scaled(self._xtest, "Testing input")

        # Fit each model to training data and get predicted training
        # and testing from each model
        yhat_train, yhat_test, histories = self._fit()

        # Undo the output scaling of the predictions if a scaler is given
        self._yscaler = yscaler
        if self._yscaler is not None:
            yhat_train = [self._yscaler.inverse_transform(yhat) for yhat in yhat_train]
            yhat_test = [self._yscaler.inverse_transform(yhat) for yhat in yhat_test]

        # Attach predictions and learning histories to the models table
        self._models = pd.concat(
            [
                self._models,
                pd.DataFrame(
                    {
                        "Train Yhat": yhat_train,
                        "Test Yhat": yhat_test,
                        "History": histories,
                    }
                ),
            ],
            axis=1,
        )
        _try_clear()

    # ===========================================================
    # Methods
    @staticmethod
    def _sort_ascending(sort_by, direction):
        """
        Decide whether smaller values of ``sort_by`` are better.

        An explicit ``direction`` of ``"min"`` or ``"max"`` always wins.
        Otherwise the metric is ascending (smaller is better) unless it is one
        of the known higher-is-better metric columns.
        """
        if direction == "min":
            return True
        if direction == "max":
            return False
        return sort_by not in PostProcessor._MAXIMIZED_SORT_KEYS

    def __check_if_standard_scaled(self, data, name=""):
        """
        Check if the data is standardized (mean ~ 0, std ~ 1).

        Raises
        ------
        ValueError
            If the mean along the first axis is not within ``1e-6`` of 0 or the
            standard deviation along the first axis is not within ``1e-6`` of 1.
        """
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)
        is_standardized = np.allclose(mean, 0, atol=1e-6) and np.allclose(
            std, 1, atol=1e-6
        )
        if not is_standardized:
            label = f"{name} data" if name else "Data"
            raise ValueError(
                f"{label} is not standard scaled: mean = {mean}, std = {std}"
            )

    def _classical_training_arrays(self):
        """
        Return ``(xtrain, ytrain)`` as used to fit classical models.

        The final dimension is dropped from an array when it has length one.
        """
        # NOTE(review): dropping a single *feature* dimension from x gives the
        # estimator one fewer dimension than the x later passed to predict;
        # kept as in the original implementation -- confirm this is intended.
        xtrain = (
            self._xtrain
            if self._xtrain.shape[-1] > 1
            else self._xtrain.isel(**{self._xtrain.dims[-1]: 0})
        )
        ytrain = (
            self._ytrain
            if self._ytrain.shape[-1] > 1
            else self._ytrain.isel(**{self._ytrain.dims[-1]: 0})
        )
        return xtrain, ytrain

    def _output_position(self, data, name):
        """
        Return the integer position of output ``name`` along the last dimension
        of ``data``.

        Raises
        ------
        ValueError
            If no coordinate of the last dimension equals ``name``.
        """
        names = data.coords[data.dims[-1]].to_numpy()
        matches = np.where(names == name)[0]
        if matches.size == 0:
            raise ValueError(f"Output {name!r} was not found in the output data")
        return int(matches[0])

    def _fit(self):
        """
        Fit all models with training data and predict both training and testing
        data.

        Returns
        -------
        results: tuple of list
            ``(yhat_train, yhat_test, histories)`` with one entry per row of the
            models table. ``histories`` holds ``None`` for classical models and
            the training history for neural networks.
        """
        yhat_train = []
        yhat_test = []
        histories = []
        num_train_outputs = self._ytrain.shape[-1]
        num_test_outputs = self._ytest.shape[-1]

        # Progress bar, advanced manually so the description shows the model
        # currently being fit
        num_models = self._models.shape[0]
        progress = tqdm(range(num_models))

        for i in range(num_models):
            model_type = self._models["Model Types"][i]
            wrapper = self._models["Model Wrappers"][i]
            config = self._models["Parameter Configurations"][i]
            is_classical = model_type in Tuner.supported_classical_models

            progress.desc = model_type
            progress.n += 1
            progress.refresh()

            # Build the estimator for this configuration
            if is_classical or not settings.values.new_nn_architecture:
                regressor = wrapper.set_params(**config)
            else:
                regressor = wrapper.build(config)

            if is_classical:
                xtrain, ytrain = self._classical_training_arrays()
                regressor.fit(xtrain.values, ytrain.values)
                histories.append(None)
                yhat_train.append(
                    regressor.predict(self._xtrain).reshape(-1, num_train_outputs)
                )
                yhat_test.append(
                    regressor.predict(self._xtest).reshape(-1, num_test_outputs)
                )
                continue

            # Neural networks: fit and keep the learning curve history
            if not settings.values.new_nn_architecture:
                fitted = regressor.fit(self._xtrain.values, self._ytrain.values)
            else:
                fitted = wrapper.fit(
                    config,
                    regressor,
                    self._xtrain.values,
                    self._ytrain.values,
                )
            histories.append(fitted.model.history.history)

            train_prediction = regressor.predict(
                self._xtrain.values, verbose=settings.values.verbosity
            )
            test_prediction = regressor.predict(
                self._xtest.values, verbose=settings.values.verbosity
            )
            if settings.values.problem_type != settings.ProblemType.REGRESSION:
                # Turn predicted probabilities into class predictions
                train_prediction = determine_class_from_probabilities(
                    train_prediction, self._ytrain.values
                )
                test_prediction = determine_class_from_probabilities(
                    test_prediction, self._ytest.values
                )
            yhat_train.append(train_prediction.reshape(-1, num_train_outputs))
            yhat_test.append(test_prediction.reshape(-1, num_test_outputs))

        return (yhat_train, yhat_test, histories)

    def metrics(
        self, y=None, model_type=None, metrics=None, sort_by=None, direction=None
    ):
        """
        Calculate model performance of predicting output training and testing data.

        Default metrics are always evaluated depending on the
        :attr:`pyMAISE.Settings.problem_type`. For
        :attr:`pyMAISE.ProblemType.REGRESSION` problems, the default metrics are from
        :cite:`scikit-learn` and include:

        - ``R2``: `r-squared <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score>`_,
        - ``MAE``: `mean absolute error <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error>`_,
        - ``MAPE``: `mean absolute percentage error <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_percentage_error.html>`_
          (in percent),
        - ``RMSE``: root mean squared error, the square root of ``MSE``,
        - ``RMSPE``: root mean squared percentage error.

        For :attr:`pyMAISE.ProblemType.CLASSIFICATION` problems, the default metrics
        are

        - ``Accuracy``: `accuracy <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score>`_,
        - ``Recall``: `recall <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score>`_,
        - ``Precision``: `precision <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score>`_,
        - ``F1``: `f1 <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score>`_

        each micro-averaged. These metrics are evaluated for both the training and
        testing data sets and stored in columns named ``Train <metric>`` and
        ``Test <metric>``.

        Parameters
        ----------
        y: int, str, or None, default=None
            The output to determine performance. If ``None`` then all outputs
            are used.
        model_type: str or None, default=None
            Determine the performance of this model. If ``None`` then all models are
            evaluated.
        metrics: dict of callable or None, default=None
            Dictionary of callable metrics such as `sklearn's metrics
            <https://scikit-learn.org/stable/modules/model_evaluation.html>`_ other
            than those already default to this method. Must take two arguments:
            ``(y_true, y_pred)``. The key is used as the name in
            ``performance_data``.
        sort_by: str or None, default=None
            The metric column to sort the return by. This should differentiate
            training and testing, for example ``Test RMSE``. If ``None`` then the
            default is ``Test R2`` for :attr:`pyMAISE.ProblemType.REGRESSION` and
            ``Test Accuracy`` for :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: `min`, `max`, or None
            Direction to ``sort_by``. Only required if a metric is defined in
            ``metrics`` that you want to sort the return by. When omitted, ``R2``,
            ``Accuracy``, ``Recall``, ``Precision``, and ``F1`` are sorted best
            (largest) first and every other metric smallest first.

        Returns
        -------
        performance_data: pandas.DataFrame
            The performance statistics for the models for both the training and
            testing data.

        Raises
        ------
        ValueError
            If ``y`` is a string that does not name an output.
        """

        def root_mean_squared_error(y_true, y_pred):
            return math.sqrt(mean_squared_error(y_true, y_pred))

        def mai_recall_score(y_true, y_pred):
            return recall_score(y_true, y_pred, average="micro")

        def mai_precision_score(y_true, y_pred):
            return precision_score(y_true, y_pred, average="micro")

        def mai_f1_score(y_true, y_pred):
            return f1_score(y_true, y_pred, average="micro")

        def mai_mean_absolute_percentage_error(y_true, y_pred):
            return mean_absolute_percentage_error(y_true, y_pred) * 100

        def root_mean_square_percentage_error(y_true, y_pred):
            # Guard against division by zero for outputs that are exactly zero
            epsilon = np.finfo(np.float64).eps
            relative_error = (y_true - y_pred) / np.maximum(np.abs(y_true), epsilon)
            return 100 * np.mean(np.sqrt(np.mean(np.square(relative_error), axis=0)))

        # Resolve which output column(s) to evaluate
        num_outputs = self._ytrain.shape[-1]
        if y is None:
            y = slice(0, num_outputs)
        elif isinstance(y, str):
            name = y
            y = np.where(self._ytrain.coords[self._ytrain.dims[-1]].to_numpy() == y)[0]
            if y.size == 0:
                raise ValueError(f"Output {name!r} was not found in the output data")

        def true_values(data):
            # Flatten to (samples, outputs), undo scaling, then select the
            # requested output(s) -- identically for training and testing data
            values = data.values.reshape(-1, num_outputs)
            if self._yscaler is not None:
                values = self._yscaler.inverse_transform(values)
            return values[:, y]

        y_true = {"Train": true_values(self._ytrain), "Test": true_values(self._ytest)}

        # Defaults come first so the first key is the default sort metric;
        # user metrics with the same key override the default
        metrics = metrics if metrics is not None else {}
        if settings.values.problem_type == settings.ProblemType.REGRESSION:
            metrics = {
                "R2": r2_score,
                "MAE": mean_absolute_error,
                "MAPE": mai_mean_absolute_percentage_error,
                "RMSE": root_mean_squared_error,
                "RMSPE": root_mean_square_percentage_error,
                **metrics,
            }
        if settings.values.problem_type == settings.ProblemType.CLASSIFICATION:
            metrics = {
                "Accuracy": accuracy_score,
                "Recall": mai_recall_score,
                "Precision": mai_precision_score,
                "F1": mai_f1_score,
                **metrics,
            }

        evaluated_metrics = {
            **{f"Train {metric}": [] for metric in metrics},
            **{f"Test {metric}": [] for metric in metrics},
        }
        for i in range(self._models.shape[0]):
            for split in ("Train", "Test"):
                y_pred = self._models[f"{split} Yhat"][i].reshape(-1, num_outputs)[:, y]
                for metric_name, func in metrics.items():
                    evaluated_metrics[f"{split} {metric_name}"].append(
                        func(y_true[split], y_pred)
                    )

        # Determine sort_by depending on problem
        if sort_by is None:
            sort_by = f"Test {next(iter(metrics))}"
        ascending = self._sort_ascending(sort_by, direction)

        # Place metrics into models DataFrame so other methods can rank by them
        for key, value in evaluated_metrics.items():
            self._models[key] = value

        models = copy.deepcopy(
            self._models[
                ["Model Types", "Parameter Configurations"]
                + list(evaluated_metrics.keys())
            ]
        )

        # Show plain dictionaries rather than keras-tuner objects
        models["Parameter Configurations"] = [
            config.values if isinstance(config, kt.HyperParameters) else config
            for config in models["Parameter Configurations"]
        ]

        if model_type is not None:
            models = models[models["Model Types"] == model_type]
        return models.sort_values(sort_by, ascending=ascending)

    def _get_idx(
        self, idx=None, model_type=None, sort_by=None, direction=None, nns_only=False
    ):
        """
        Get index of model in ``pandas.DataFrame`` based on model type and sort_by
        condition.

        If ``idx`` is given it is returned unchanged. Otherwise the best model
        according to ``sort_by``/``direction`` is chosen among the models of
        ``model_type`` or, when no type is given, among all models (only neural
        networks if ``nns_only``).

        Raises
        ------
        RuntimeError
            If ``model_type`` is not one of the stored model types.
        KeyError
            If the ``sort_by`` column has not been computed.
        ValueError
            If there is no candidate model to choose from.
        """
        available = self._models["Model Types"]
        # Exact comparison: a substring such as "NN" must not match "FNN"
        if model_type is not None and not (available == model_type).any():
            raise RuntimeError(
                f"Model {model_type} was not given to {PostProcessor.__name__}"
            )

        if idx is not None:
            return idx

        # Determine sort_by depending on problem
        if sort_by is None:
            if settings.values.problem_type == settings.ProblemType.REGRESSION:
                sort_by = "Test R2"
            elif settings.values.problem_type == settings.ProblemType.CLASSIFICATION:
                sort_by = "Test Accuracy"
        if sort_by not in self._models.columns:
            raise KeyError(
                f"{sort_by} is not an evaluated metric; run "
                + f"{PostProcessor.__name__}.metrics first"
            )

        # Select the candidate rows
        if model_type is not None:
            candidates = available == model_type
        else:
            allowed = set(available.unique())
            if nns_only:
                allowed -= set(Tuner.supported_classical_models)
            candidates = available.isin(allowed)

        scores = self._models.loc[candidates, sort_by]
        if scores.empty:
            raise ValueError("There are no models to select from")

        if self._sort_ascending(sort_by, direction):
            return scores.idxmin()
        return scores.idxmax()

    def save_models(
        self,
        num_models=10,
        idxs=None,
        model_types=None,
        sort_by=None,
        direction=None,
        directory=".",
    ):
        """
        Saves the top models. Models are named as ``<Model Type>_<Index in to
        metrics table>``. Each model is refit before it is saved.

        Parameters
        ----------
        num_models: int, default=10
            Number of models to save. Only used when ``idxs`` is ``None``.
        idxs: int, list of ints, numpy.ndarray of ints, or None, default=None
            The indices in the :meth:`pyMAISE.PostProcessor.metrics`
            pandas.DataFrame. If ``None``, then ``sort_by`` is used.
        model_types: str, list of str, or None, default=None
            The model name(s) to get. Will get the best models based on
            ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.
        directory: str, default="."
            Directory to save the models to. ``tf.keras`` ``Sequential`` models
            are written with their own ``save`` method; every other model is
            pickled to a ``.pkl`` file.

        Raises
        ------
        TypeError
            If ``idxs`` or ``model_types`` has an unsupported type.
        """
        # ``is not None`` so that index 0 and numpy arrays are handled correctly
        if idxs is not None:
            if isinstance(idxs, (int, np.integer)):
                idxs = [idxs]
            elif not isinstance(idxs, (list, np.ndarray)):
                raise TypeError(
                    "idxs must be an int, list of ints, np.ndarray of ints, or None"
                )

        if model_types is not None:
            if isinstance(model_types, str):
                model_types = [model_types]
            elif not isinstance(model_types, (list, np.ndarray)):
                raise TypeError(
                    "model_types must be a string, list of strings, "
                    + "np.ndarray of strings, or None"
                )

        # Get sort if not given and ascending
        if sort_by is None:
            sort_by = (
                "Test R2"
                if settings.values.problem_type == settings.ProblemType.REGRESSION
                else "Test Accuracy"
            )
        ascending = self._sort_ascending(sort_by, direction)

        # Rank the models and take the best ones if indices are not given
        if idxs is None:
            ranked = self._models[
                ["Model Types", "Parameter Configurations", "Model Wrappers", sort_by]
            ].sort_values(sort_by, ascending=ascending)
            if model_types is not None and len(model_types) > 0:
                ranked = ranked[ranked["Model Types"].isin(set(model_types))]
            idxs = ranked.index.values[:num_models]

        # A progress bar is only shown when verbosity is 0
        progress = None
        if settings.values.verbosity == 0 and len(idxs) > 0:
            progress = tqdm(
                range(len(idxs)),
                desc=f"{self._models['Model Types'][idxs[0]]}_{idxs[0]}",
            )

        for idx in idxs:
            name = f"{self._models['Model Types'][idx]}_{idx}"
            if progress is not None:
                progress.desc = name
                progress.refresh()

            # Train model
            model = self.get_model(idx=idx)

            # Save model
            if isinstance(model, tf.keras.models.Sequential):
                model.save(f"{directory}/{name}")
            else:
                with open(f"{directory}/{name}.pkl", "wb") as file:
                    pickle.dump(model, file)

            if progress is not None:
                progress.n += 1
                progress.refresh()

        if progress is not None:
            _try_clear()

    def get_predictions(self, idx=None, model_type=None, sort_by=None, direction=None):
        """
        Get a model's training and testing predictions.

        Parameters
        ----------
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model predictions based on
            ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.

        Returns
        -------
        yhat: tuple of numpy.array
            The predicted training and testing data given as
            ``(train_yhat, test_yhat)``.
        """
        idx = self._get_idx(
            idx=idx, model_type=model_type, sort_by=sort_by, direction=direction
        )
        return (self._models["Train Yhat"][idx], self._models["Test Yhat"][idx])

    def get_params(self, idx=None, model_type=None, sort_by=None, direction=None):
        """
        Returns the hyperparameters for a given model.

        Parameters
        ----------
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.

        Returns
        -------
        params: pandas.DataFrame
            The hyperparameters of the model.
        """
        idx = self._get_idx(
            idx=idx, model_type=model_type, sort_by=sort_by, direction=direction
        )
        model_type = self._models["Model Types"][idx]

        # Copy so the stored configuration cannot be modified through the result
        parameters = copy.deepcopy(self._models["Parameter Configurations"][idx])
        if (
            model_type not in Tuner.supported_classical_models
            and settings.values.new_nn_architecture
        ):
            # New-architecture networks store a hyperparameter object; its
            # ``values`` attribute holds the name-to-value mapping
            parameters = parameters.values

        return pd.DataFrame({"Model Types": [model_type], **parameters})

    def get_model(self, idx=None, model_type=None, sort_by=None, direction=None):
        """
        Get a model. The model with the chosen hyperparameters is refit and returned.

        Parameters
        ----------
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.

        Returns
        -------
        model: sklearn or keras model
            The model refit based on the parameters from the arguments.
        """
        idx = self._get_idx(
            idx=idx, model_type=model_type, sort_by=sort_by, direction=direction
        )
        resolved_type = self._models["Model Types"][idx]
        wrapper = self._models["Model Wrappers"][idx]
        config = self._models["Parameter Configurations"][idx]

        if (
            resolved_type in Tuner.supported_classical_models
            or not settings.values.new_nn_architecture
        ):
            xtrain, ytrain = self._classical_training_arrays()
            return wrapper.set_params(**config).fit(xtrain.values, ytrain.values)

        # New-architecture neural network: build, name, then train in place
        regressor = wrapper.build(config)
        regressor._name = resolved_type
        wrapper.fit(
            config,
            regressor,
            self._xtrain.values,
            self._ytrain.values,
        )
        return regressor

    def diagonal_validation_plot(
        self, ax=None, y=None, idx=None, model_type=None, sort_by=None, direction=None
    ):
        """
        Create a diagonal validation plot for a given model.

        Parameters
        ----------
        ax: matplotlib.pyplot.axis or None, default=None
            If not given, then the current axis is used.
        y: single or list of int or str or None, default=None
            The output to plot. If ``None`` then all outputs are plotted.
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.

        Returns
        -------
        ax: matplotlib.pyplot.axis
            The plot.
        """
        idx = self._get_idx(
            idx=idx, model_type=model_type, sort_by=sort_by, direction=direction
        )

        # Get the list of y if not provided
        if not isinstance(y, list):
            y = [y] if y is not None else list(range(self._ytrain.shape[-1]))

        if ax is None:
            ax = plt.gca()

        ytrain = self._ytrain.values
        ytest = self._ytest.values
        if self._yscaler is not None:
            ytrain = self._yscaler.inverse_transform(
                ytrain.reshape(-1, ytrain.shape[-1])
            )
            ytest = self._yscaler.inverse_transform(ytest.reshape(-1, ytest.shape[-1]))

        yhat_train = self._models["Train Yhat"][idx]
        yhat_test = self._models["Test Yhat"][idx]
        for y_idx in y:
            # Output names are translated to their column position
            if isinstance(y_idx, str):
                y_idx = self._output_position(self._ytrain, y_idx)
            ax.scatter(yhat_train[..., y_idx], ytrain[..., y_idx], c="b", marker="o")
            ax.scatter(yhat_test[..., y_idx], ytest[..., y_idx], c="r", marker="o")

        # Square axes with a y = x reference line
        lims = [
            np.min([ax.get_xlim(), ax.get_ylim()]),
            np.max([ax.get_xlim(), ax.get_ylim()]),
        ]
        ax.plot(lims, lims, "k--")
        ax.set_aspect("equal")
        ax.set_xlim(lims)
        ax.set_ylim(lims)
        ax.legend(["Training Data", "Testing Data"])
        ax.set_xlabel("Predicted Outcome")
        ax.set_ylabel("Actual Outcome")
        return ax

    def validation_plot(
        self,
        ax=None,
        y=None,
        idx=None,
        model_type=None,
        sort_by=None,
        direction=None,
    ):
        """
        Create a validation plot for a given model: the absolute relative error
        of each testing sample, in percent.

        Parameters
        ----------
        ax: matplotlib.pyplot.axis or None, default=None
            If not given, then the current axis is used.
        y: single or list of int or str or None, default=None
            The output to plot. If ``None`` then all outputs are plotted.
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.

        Returns
        -------
        ax: matplotlib.pyplot.axis
            The plot.
        """
        idx = self._get_idx(
            idx=idx, model_type=model_type, sort_by=sort_by, direction=direction
        )

        # Get the list of y if not provided
        if not isinstance(y, list):
            y = [y] if y is not None else list(range(self._ytrain.shape[-1]))

        # Actual outputs are always flattened to (samples, outputs) so they
        # line up with the stored predictions, scaled or not
        ytest = self._ytest.values.reshape(-1, self._ytest.shape[-1])
        if self._yscaler is not None:
            ytest = self._yscaler.inverse_transform(ytest)
        yhat_test = self._models["Test Yhat"][idx]

        if ax is None:
            ax = plt.gca()

        output_names = self._ytest.coords[self._ytest.dims[-1]].values
        sample_number = np.linspace(1, ytest.shape[0], ytest.shape[0])
        for y_idx in y:
            # If the column name is given as opposed to the position,
            # find the position
            if isinstance(y_idx, str):
                y_idx = self._output_position(self._ytest, y_idx)
            ax.scatter(
                sample_number,
                np.abs((ytest[:, y_idx] - yhat_test[:, y_idx]) / ytest[:, y_idx]) * 100,
                label=output_names[y_idx],
            )

        if len(y) > 1:
            ax.legend()
        ax.set_xlabel("Testing Data Index")
        ax.set_ylabel("Absolute Relative Error (%)")
        return ax

    def nn_learning_plot(
        self, ax=None, idx=None, model_type=None, sort_by=None, direction=None
    ):
        """
        Create a learning plot for a given neural network.

        Parameters
        ----------
        ax: matplotlib.pyplot.axis or None, default=None
            If not given then the current axis is used.
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.

        Returns
        -------
        ax: matplotlib.pyplot.axis
            The plot.

        Raises
        ------
        RuntimeError
            If the selected model has no learning history (it is not a neural
            network).
        """
        idx = self._get_idx(
            idx=idx,
            model_type=model_type,
            sort_by=sort_by,
            direction=direction,
            nns_only=True,
        )

        history = self._models["History"][idx]
        if not isinstance(history, dict):
            raise RuntimeError(
                f"Model {self._models['Model Types'][idx]}_{idx} has no "
                + "learning history"
            )

        if ax is None:
            ax = plt.gca()

        ax.plot(history["loss"], label="Training")
        # Validation loss is only recorded when validation data was used
        if "val_loss" in history:
            ax.plot(history["val_loss"], label="Validation")
        ax.legend()
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss")
        return ax

    def print_model(
        self, idx=None, model_type=None, sort_by=None, direction=None, **kwargs
    ):
        """
        Print a models tuned hyperparameters.

        Parameters
        ----------
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.
        kwargs:
            Any arguments used by `tensorflow.keras.Sequential.summary()
            <https://www.tensorflow.org/api_docs/python/tf/keras/Sequential#summary>`_.
        """
        idx = self._get_idx(
            idx=idx,
            model_type=model_type,
            sort_by=sort_by,
            direction=direction,
            nns_only=False,
        )
        # Branch on the type of the model actually selected; the ``model_type``
        # argument may be None when the model is chosen by index or by rank
        resolved_type = self._models["Model Types"][idx]

        # Get model parameters as {name: {0: value}}
        params = self.get_params(idx=idx).to_dict()

        print(f"Model Type: {params.pop('Model Types')[0]}")
        if (
            resolved_type in Tuner.supported_classical_models
            or not settings.values.new_nn_architecture
        ):
            for key, value in params.items():
                print(f" {key}: {value[0]}")
            return

        # Get keras model
        wrapper = self._models["Model Wrappers"][idx]
        model = wrapper.build(self._models["Parameter Configurations"][idx])

        # Layer-specific hyperparameters; printed entries are removed from
        # ``params`` so they are not repeated below
        print(" Structural Hyperparameters")
        for layer in model.layers:
            print(f" Layer: {layer.name}")
            for key in list(params):
                if layer.name not in key:
                    continue
                reduced_key = key.replace(f"{layer.name}_", "")
                if reduced_key == "sublayer" or "sublayer" not in reduced_key:
                    value = params.pop(f"{layer.name}_{reduced_key}")[0]
                    print(f" {reduced_key}: {value}")

        # Remaining hyperparameters that do not belong to any layer
        print(" Compile/Fitting Hyperparameters")
        layer_names = wrapper.layer_dict.keys()
        for key, value in params.items():
            if not any(layer_name in key for layer_name in layer_names):
                print(f" {key}: {value[0]}")

        model.summary(**kwargs)

    def nn_network_plot(
        self, idx=None, model_type=None, sort_by=None, direction=None, **kwargs
    ):
        """
        Plot NN network.

        .. note::
            For this to work you must have graphviz installed which can be done
            through your package manager.

        Parameters
        ----------
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.
        kwargs:
            Any arguments related to `tensorflow.keras.utils.plot_model()
            <https://www.tensorflow.org/api_docs/python/tf/keras/utils/plot_model>`_
            except ``model``.

        Returns
        -------
        The return value of ``tensorflow.keras.utils.plot_model``.
        """
        idx = self._get_idx(
            idx=idx,
            model_type=model_type,
            sort_by=sort_by,
            direction=direction,
            nns_only=True,
        )

        # Get keras model
        model = self._models["Model Wrappers"][idx].build(
            self._models["Parameter Configurations"][idx]
        )

        # Run plotter
        return tf.keras.utils.plot_model(model, **kwargs)

    def confusion_matrix(
        self,
        axs=None,
        idx=None,
        model_type=None,
        sort_by=None,
        direction=None,
        colorbar=False,
        annotate=True,
        round=2,
    ):
        """
        Create training and testing confusion matrix.

        Parameters
        ----------
        axs: list of 2 matplotlib.pyplot.axis or None, default=None
            If not given then the axes are created.
        idx: int or None, default=None
            The index in the :meth:`pyMAISE.PostProcessor.metrics` pandas.DataFrame.
            If ``None``, then ``sort_by`` is used.
        model_type: str or None, default=None
            The model name to get. Will get the best model based on ``sort_by``.
        sort_by: str or None, default=None
            The metric column to sort the pandas.DataFrame from
            :meth:`pyMAISE.PostProcessor.metrics` by. If ``None`` then
            ``Test R2`` is used for :attr:`pyMAISE.ProblemType.REGRESSION`
            and ``Test Accuracy`` is used for
            :attr:`pyMAISE.ProblemType.CLASSIFICATION`.
        direction: 'min', 'max', or None, default=None
            The direction to ``sort_by``. It is only required if ``sort_by`` is not
            a default metric.
        colorbar: Boolean, default=False
            Whether to include a colorbar.
        annotate: Boolean, default=True
            Whether to include annotations (number and percentage).
        round: int, default=2
            Number of digits to round percentage in annotation.

        Returns
        -------
        axs: tuple of matplotlib.pyplot.axis
            The two confusion matrix axes: ``(cm_train, cm_test)``
        """
        idx = self._get_idx(
            idx=idx, model_type=model_type, sort_by=sort_by, direction=direction
        )

        # Display labels: the part of each output name after the first
        # underscore (the whole name if it has none)
        labels = [
            str(label).split("_", 1)[-1]
            for label in self._ytrain.coords[self._ytrain.dims[-1]].values
        ]

        # Get predicted and actual outputs
        yhat_train = self._models["Train Yhat"][idx]
        yhat_test = self._models["Test Yhat"][idx]
        ytrain = self._ytrain.values
        ytest = self._ytest.values

        # Convert one-hot encoding to class positions
        if ytrain.shape[-1] > 1:
            yhat_train = np.argmax(yhat_train, axis=-1)
            yhat_test = np.argmax(yhat_test, axis=-1)
            ytrain = np.argmax(ytrain, axis=-1)
            ytest = np.argmax(ytest, axis=-1)

        # Confusion matrix
        train_cm = confusion_matrix(y_true=ytrain, y_pred=yhat_train)
        test_cm = confusion_matrix(y_true=ytest, y_pred=yhat_test)

        # Axes
        if axs is None:
            _, axs = plt.subplots(1, 2)

        # Confusion matrix display
        ConfusionMatrixDisplay(confusion_matrix=train_cm, display_labels=labels).plot(
            include_values=False, ax=axs[0], colorbar=colorbar
        )
        ConfusionMatrixDisplay(confusion_matrix=test_cm, display_labels=labels).plot(
            include_values=False, ax=axs[1], colorbar=colorbar
        )

        # Add values and percentages. Each matrix is annotated on its own so
        # differently shaped matrices are handled. A matrix cell [row, col] is
        # drawn at x=col, y=row.
        if annotate:
            for axis, matrix in ((axs[0], train_cm), (axs[1], test_cm)):
                total = np.sum(matrix)
                for (row, col), value in np.ndenumerate(matrix):
                    axis.text(
                        col,
                        row,
                        f"{value}\n{np.round(value / total * 100, round)}%",
                        ha="center",
                        va="center",
                    )

        axs[0].set_title("Training Set")
        axs[1].set_title("Testing Set")
        return axs