# Source code for pyrolite.util.skl.vis

import itertools

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import scipy.special
import scipy.stats

from pyrolite.util.meta import inargs, subkwargs
from pyrolite.util.plot import DEFAULT_DISC_COLORMAP

from ..log import Handle

# Module-level logger, built from the package's `Handle` helper (imported from
# the sibling `log` module) and named after this module.
logger = Handle(__name__)

# scikit-learn is treated as an optional dependency: a failed import is logged
# as a warning instead of raising at import time.
try:
    import sklearn.datasets
    import sklearn.manifold
    from sklearn.metrics import confusion_matrix
except ImportError:
    # The names imported above stay undefined in this case, so functions in
    # this module which reference them will raise NameError when called.
    msg = "scikit-learn not installed"
    logger.warning(msg)


def plot_confusion_matrix(
    *args,
    ax=None,
    classes=None,
    class_order=None,
    normalize=False,
    title="Confusion Matrix",
    cmap=plt.cm.Blues,
    norm=None,
    xlabelrotation=None,
):
    """
    Plot a confusion matrix as an annotated image.

    Row-wise normalization can be applied by setting `normalize=True`.

    Parameters
    ------------
    args : :class:`tuple`
        Data to evaluate and visualise a confusion matrix:

            * A single confusion matrix (n x n)
            * A tuple of (y_test, y_predict)
            * A tuple of (classifier_model, X_test, y_test)

    ax : :class:`matplotlib.axes.Axes`
        Axis to plot on, if one exists. A new figure is created otherwise.
    classes: :class:`list`
        List of class names to use as labels, and for ordering (see below).
        This should match the order contained within the model. Where a
        classifier model is passed and no classes are given, the classes
        will be extracted from the model's :code:`classes_` attribute.
        Where no classes are available, integer indexes are used.
    class_order : :class:`list`
        List of classes in the desired order along the axes. Should match
        the supplied classes where classes are given, or integer indicies
        for where no named classes are given.
    normalize : :class:`bool`
        Whether to normalize the counts in each row of the confusion matrix
        (i.e. for each true class) to the sum of that row, such that values
        are between 0 and 1. Rows without any samples are shown as zeros.
    title : :class:`str`
        Title for the axes.
    cmap : :class:`str` | :class:`matplotlib.color.Colormap`
        Colormap for the visualisation of the confusion matrix.
    norm : :class:`matplotlib.colors.Normalize`
        Normalization for the colormap visualisation across the confusion matrix.
        Where none is given, the colormap spans 0 to 1 for normalized
        matrices, and 0 to the maximum count otherwise.
    xlabelrotation : :class:`float`
        Rotation in degrees for the xaxis labels.

    Returns
    --------
    ax : :class:`matplotlib.axes.Axes`

    Raises
    ------
    NotImplementedError
        If the number of positional arguments is not 1, 2 or 3.
    AssertionError
        If `class_order` does not contain exactly the same classes as `classes`.
    ValueError
        If the number of classes does not match the size of the matrix.
    """

    def _missing(labels):
        # explicit length check: the truth value of a numpy array is ambiguous
        return labels is None or len(labels) == 0

    if len(args) == 1:
        conf_matrix = np.asarray(args[0])  # also accept nested lists
    elif len(args) == 2:
        y_test, y_predict = args
        conf_matrix = confusion_matrix(y_test, y_predict)
    elif len(args) == 3:
        clf, X_test, y_test = args
        y_predict = clf.predict(X_test)
        if _missing(classes) and hasattr(clf, "classes_"):
            classes = list(clf.classes_)
        conf_matrix = confusion_matrix(y_test, y_predict)
    else:
        raise NotImplementedError(
            "Supply either i) a confusion matrix, ii) the test and"
            " predict arrays or iii) the classifier, X_test and y_test arrays."
        )

    if _missing(classes):
        classes = np.arange(conf_matrix.shape[0])

    if len(classes) != conf_matrix.shape[0]:
        # otherwise the tick labels would silently be attached to the wrong cells
        raise ValueError(
            "Number of classes ({}) does not match the size of the confusion "
            "matrix ({}).".format(len(classes), conf_matrix.shape[0])
        )

    if class_order is not None:
        _classes = list(classes)  # for .index
        class_order = list(class_order)
        same_members = all(c in _classes for c in class_order) and all(
            c in class_order for c in _classes
        )
        if not same_members:
            # raised explicitly (rather than via `assert`) so that the check
            # is retained when running with optimisations enabled
            raise AssertionError(
                "class_order must contain exactly the same classes as classes."
            )
        class_indexes = np.array([_classes.index(c) for c in class_order])
        conf_matrix = conf_matrix[np.ix_(class_indexes, class_indexes)]
        classes = class_order

    if normalize:
        row_totals = conf_matrix.sum(axis=1, keepdims=True)
        # rows without samples stay at zero instead of becoming NaN
        conf_matrix = np.divide(
            conf_matrix,
            row_totals,
            out=np.zeros(conf_matrix.shape, dtype=float),
            where=row_totals != 0,
        )
        if norm is None:
            norm = matplotlib.colors.Normalize(vmin=0, vmax=1.0)
    elif norm is None:
        # the colormap will need to be normalized across the count range;
        # a user-supplied norm takes precedence
        norm = matplotlib.colors.Normalize(vmin=0, vmax=np.max(conf_matrix))

    if ax is None:
        fig, ax = plt.subplots(1)
    fig = ax.figure  # act on the figure owning the axes, not the 'current' one

    im = ax.imshow(conf_matrix, interpolation="none", cmap=cmap, norm=norm)
    ax.set_title(title)
    fig.colorbar(im, ax=ax)
    tick_marks = np.arange(len(classes))

    # integer counts are shown as-is, anything else with two decimal places
    fmt = "d" if np.issubdtype(conf_matrix.dtype, np.integer) else ".2f"
    # cells darker than half of the maximum get white text for contrast
    threshold = conf_matrix.max() / 2.0
    for i, j in itertools.product(
        range(conf_matrix.shape[0]), range(conf_matrix.shape[1])
    ):
        ax.text(
            j,
            i,
            format(conf_matrix[i, j], fmt),
            horizontalalignment="center",
            color="white" if conf_matrix[i, j] > threshold else "black",
        )

    ax.set(
        ylabel="True",
        xlabel="Predicted",
        xticks=tick_marks,
        yticks=tick_marks,
        xticklabels=classes,
        yticklabels=classes,
    )
    if xlabelrotation is not None:
        plt.setp(ax.get_xticklabels(), rotation=xlabelrotation)
    ax.grid(False)
    fig.tight_layout()
    return ax


def plot_gs_results(gs, xvar=None, yvar=None):
    """
    Plots the results from a GridSearch showing location of optimum in 2D.

    Parameters
    ----------
    gs : :class:`sklearn.model_selection.GridSearchCV`
        Fitted grid search, with a dictionary :code:`param_grid` and
        :code:`cv_results_["mean_test_score"]`.
    xvar : :class:`str`
        Name of the grid parameter to show along the x axis. Where not
        given, one of the remaining parameters is used.
    yvar : :class:`str`
        Name of the grid parameter to show along the y axis. Where not
        given, one of the remaining parameters is used.

    Returns
    -------
    ax : :class:`matplotlib.axes.Axes`
        Axes on which the scores are plotted.

    Raises
    ------
    ValueError
        If the parameter grid is empty, or `xvar` and `yvar` are identical.

    Notes
    -----
    Where the grid contains more than two parameters, the score shown for
    each (x, y) combination is the best (maximum) mean test score across all
    values of the remaining parameters.
    """

    def _ticklabel(value):
        # parameter values are not necessarily numeric (e.g. kernel names)
        try:
            return "{:01.2g}".format(value)
        except (TypeError, ValueError):
            return str(value)

    param_grid = gs.param_grid
    grid_items = list(param_grid.items())
    if not grid_items:
        raise ValueError("The parameter grid is empty; there is nothing to plot.")

    if len(grid_items) == 1:
        # if there's only one item, there's only one way to plot it.
        (xvar, xx) = grid_items[0]
        (yvar, yy) = "", np.array([0])
    else:
        if xvar is not None and xvar == yvar:
            raise ValueError("xvar and yvar must be different parameters.")
        if xvar is None and yvar is None:
            (yvar, yy), (xvar, xx) = grid_items[:2]
        elif xvar is not None and yvar is not None:
            yy, xx = param_grid[yvar], param_grid[xvar]
        elif xvar is not None:
            xx = param_grid[xvar]
            yvar, yy = next((k, v) for (k, v) in grid_items if k != xvar)
        else:
            yy = param_grid[yvar]
            xvar, xx = next((k, v) for (k, v) in grid_items if k != yvar)
    xx, yy = np.array(xx), np.array(yy)

    # scikit-learn enumerates the candidates of a parameter grid with the
    # parameter names sorted and the last name varying fastest, so the flat
    # score array is first reshaped to one axis per (sorted) parameter
    keys = sorted(param_grid)
    grid_shape = [len(param_grid[k]) for k in keys]
    scores = np.asarray(gs.cv_results_["mean_test_score"]).reshape(grid_shape)
    # bring the plotted parameters to the front as (x, y), and collapse any
    # remaining parameters onto a single trailing axis
    plot_axes = [keys.index(k) for k in (xvar, yvar) if k in param_grid]
    scores = np.moveaxis(scores, plot_axes, list(range(len(plot_axes))))
    scores = scores.reshape(len(xx), len(yy), -1)
    if scores.shape[-1] == 1:
        results = scores[..., 0]
    else:
        results = np.nanmax(scores, axis=-1)  # best score over other parameters

    fig, ax = plt.subplots(1)
    ax.imshow(results.T, cmap=plt.cm.Blues)

    ax.set(
        xlabel=xvar,
        ylabel=yvar,
        xticks=np.arange(len(xx)),
        yticks=np.arange(len(yy)),
        xticklabels=[_ticklabel(i) for i in xx],
        yticklabels=[_ticklabel(i) for i in yy],
    )
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.invert_yaxis()

    # mark the location(s) of the optimum score
    x, y = np.where(results == np.nanmax(results))
    ax.scatter(x, y, marker="D", s=100, c="k")
    return ax


def alphas_from_multiclass_prob(probs, method="entropy", alpha=1.0):
    """
    Take an array of multiclass probabilities and map to an alpha variable.

    Parameters
    -----------
    probs : :class:`numpy.ndarray`
        Multiclass probabilities with shape (nsamples, nclasses).

    method : :class:`str`, :code:`entropy` | :code:`kl_div`
        Method for mapping probabilities to alphas. Any value other than
        :code:`entropy` selects the :code:`kl_div` method.
    alpha : :class:`float`
        Optional specification of overall maximum alpha value.

    Returns
    ----------
    a : :class:`numpy.ndarray`
        Alpha values for each sample with shape (nsamples,).

    Notes
    -----
    Alphas are scaled relative to the most confident sample in `probs`, which
    receives the maximum alpha. Where no sample is more informative than a
    uniform distribution across the classes, all alphas are zero.
    """
    probs = np.asarray(probs, dtype=float)
    n_samples, n_classes = probs.shape
    if n_samples == 0:
        return np.zeros(0)

    # null-scenario: equal probability for every class
    netzero = np.full(n_classes, 1.0 / n_classes)
    if method == "entropy":
        # uniform distribution has maximum entropy
        max_H = scipy.stats.entropy(netzero)
        # entropy is evaluated along the first axis, hence the transpose
        H = scipy.stats.entropy(probs.T)
        min_H = np.min(H)
        span = max_H - min_H
        if span > 1e-12:
            rel_H = (H - min_H) / span  # between zero and one
        else:
            # every sample is as uncertain as the uniform distribution;
            # avoid dividing by zero
            rel_H = np.ones_like(H)
        a = 1.0 - rel_H
    else:
        # alpha as sum of information gain
        a = scipy.special.kl_div(probs, netzero).sum(axis=1)
        peak = np.max(a)
        # no information gain for any sample: avoid dividing by zero
        a = a / peak if peak > 0 else np.zeros_like(a)
    return a * alpha


def plot_mapping(
    X,
    Y,
    mapping=None,
    ax=None,
    cmap=None,
    alpha=1.0,
    s=10,
    alpha_method="entropy",
    **kwargs,
):
    """
    Plot a low-dimensional mapping of samples, coloured by their targets.

    Parameters
    ----------
    X : :class:`numpy.ndarray`
        Coordinates in multidimensional space.
    Y : :class:`numpy.ndarray` | :class:`sklearn.base.BaseEstimator`
        An array of targets, or a method to obtain such an array of targets
        via :func:`Y.predict`. Transformers with probabilistic output
        (via :func:`Y.predict_proba`) will have these probability estimates accounted
        for via the alpha channel.
    mapping : :class:`numpy.ndarray` | :class:`~sklearn.base.TransformerMixin` | :class:`str`
        Mapped points, a transformer to create mapped points, or the name of
        a manifold method (:code:`'mds'`, :code:`'isomap'` or :code:`'tsne'`).
        Where not given, metric multidimensional scaling is used.
    ax : :class:`matplotlib.axes.Axes`
        Axes to plot on.
    cmap : :class:`matplotlib.cm.ListedColormap`
        Colormap to use for the classification visualisation (ideally this should be
        a discrete colormap unless the classes are organised ).
    alpha : :class:`float`
        Coefficient for alpha.
    s : :class:`float`
        Marker size for the scatter plot.
    alpha_method : :code:`'entropy' or 'kl_div'`
        Method to map class probabilities to alpha. :code:`'entropy'` uses a measure of
        entropy relative to null-scenario of equal distribution across classes, while
        :code:`'kl_div'` calculates the information gain relative to the same
        null-scenario.
    **kwargs
        Keyword arguments accepted by a manifold method selected by name are
        passed to it (overriding its defaults); the remaining keyword
        arguments are passed to :func:`matplotlib.pyplot.subplots` where a
        new figure is created.

    Returns
    -------
    ax : :class:`~matplotlib.axes.Axes`
        Axes on which the mapping is plotted.
    tfm : :class:`~sklearn.base.BaseEstimator`
        Fitted mapping transform, or :code:`None` where mapped points were given.
    mapped : :class:`numpy.ndarray`
        Mapped points which are plotted.

    Raises
    ------
    NotImplementedError
        If `mapping` is a string which is not a recognised manifold method.
    AssertionError
        If the number of mapped points differs from the number of samples.

    Todo
    ------

        * Option to generate colors for individual classes

            This could be based on the distances between their centres in
            multidimensional space (or low dimensional mapping of this space),
            enabling a continuous (n-dimensional) colormap to be used
            to show similar classes, in addition to classification confidence.
    """
    X_ = X.copy()  # avoid modifying input array
    tfm_kwargs = {}  # keyword arguments consumed by the transformer
    if mapping is None:
        mapping = "mds"  # default: metric multidimensional scaling

    if isinstance(mapping, str):
        manifolds = {
            "mds": (sklearn.manifold.MDS, dict(n_components=2, metric=True)),
            # not necessarily consistent orientation, but consistent shape
            "isomap": (sklearn.manifold.Isomap, dict(n_components=2)),
            # likely need to optimise!
            "tsne": (sklearn.manifold.TSNE, dict(n_components=2)),
        }
        try:
            cls, defaults = manifolds[mapping.lower()]
        except KeyError:
            raise NotImplementedError(
                "Mapping '{}' not recognised; use one of: {}.".format(
                    mapping, ", ".join(manifolds)
                )
            ) from None
        tfm_kwargs = subkwargs(kwargs, cls)
        tfm = cls(**{**defaults, **tfm_kwargs})
        mapped = tfm.fit_transform(X_)
    elif isinstance(
        mapping, (sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
    ):  # manifold transforms can be either
        tfm = mapping
        mapped = tfm.fit_transform(X_)
    else:  # mapping is already performed, expect array-like coordinates
        mapped = np.asarray(mapping)
        tfm = None

    if mapped.shape[0] != X_.shape[0]:
        # raised explicitly (rather than via `assert`) so that the check is
        # retained when running with optimisations enabled
        raise AssertionError(
            "The number of mapped points ({}) does not match the number of "
            "samples ({}).".format(mapped.shape[0], X_.shape[0])
        )

    if ax is None:
        # keywords already consumed by the transformer are not figure options
        fig_kwargs = {k: v for k, v in kwargs.items() if k not in tfm_kwargs}
        fig, ax = plt.subplots(1, **fig_kwargs)

    if isinstance(Y, (np.ndarray, list)):
        c = Y  # need to encode alpha here
    elif isinstance(Y, (sklearn.base.BaseEstimator)):
        # need to split this into  multiple methods depending on form of classifier
        if hasattr(Y, "predict_proba"):
            classes = Y.predict(X_)
            # resolve to a colormap object (allowing for named colormaps)
            # such that it can be called to produce RGBA colors
            cmap = plt.get_cmap(cmap or DEFAULT_DISC_COLORMAP)
            c = cmap(classes)
            ps = Y.predict_proba(X_)
            a = alphas_from_multiclass_prob(ps, method=alpha_method, alpha=alpha)
            c[:, -1] = a  # classification confidence as the alpha channel
            cmap = None  # colors are now explicit RGBA values
        else:
            c = Y.predict(X_)
            cmap = cmap or DEFAULT_DISC_COLORMAP
    else:
        # other array-likes of targets (e.g. tuples, pandas objects)
        c = np.asarray(Y)

    ax.scatter(*mapped.T, c=c, s=s, edgecolors="none", cmap=cmap)
    return ax, tfm, mapped