Source code for pyrolite.util.plot.density

"""
Functions for dealing with kernel density estimation.

Attributes
----------
USE_PCOLOR : :class:`bool`
    Option to use the :func:`matplotlib.pyplot.pcolor` function in place
    of :func:`matplotlib.pyplot.pcolormesh`.
"""

import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
from numpy.linalg import LinAlgError

from ..distributions import sample_kde
from ..log import Handle
from ..math import interpolate_line, linspc_, logspc_
from ..meta import subkwargs
from .grid import bin_centres_to_edges

logger = Handle(__name__)

try:
    import statsmodels.api as sm

    HAVE_SM = True
except ImportError:
    HAVE_SM = False

USE_PCOLOR = False



[docs]
def get_axis_density_methods(ax):
    """
    Get the relevant density and contouring methods for a given axis.

    Parameters
    -----------
    ax : :class:`matplotlib.axes.Axes` | :class:`mpltern.ternary.TernaryAxes`
        Axis to check.

    Returns
    --------
    pcolor, contour, contourf
        Relevant functions for this axis.
    """
    if ax.name == "ternary":
        pcolor = ax.tripcolor
        contour = ax.tricontour
        contourf = ax.tricontourf
    else:
        if USE_PCOLOR:
            pcolor = ax.pcolor
        else:
            pcolor = ax.pcolormesh
        contour = ax.contour
        contourf = ax.contourf
    return pcolor, contour, contourf




[docs]
def percentile_contour_values_from_meshz(
    z, percentiles=[0.95, 0.66, 0.33], resolution=1000
):
    """
    Integrate a probability density distribution Z(X,Y) to obtain contours in Z which
    correspond to specified percentile contours. Contour values will be returned
    with the same order as the inputs.

    Parameters
    ----------
    z : :class:`numpy.ndarray`
        Probability density function over x, y.
    percentiles : :class:`numpy.ndarray`
        Percentile values for which to create contours.
    resolution : :class:`int`
        Number of bins for thresholds between 0. and max(Z)

    Returns
    -------
    labels : :class:`list`
        Labels for contours (percentiles, if above minimum z value).
    contours : :class:`list`
        Contour height values.

    Todo
    -----
    This may error for a list of percentiles where one or more requested
    values are below the miniumum threshold. The exception handling should
    be updated to cater for arrays - where some of the values may be above
    the minimum.
    """
    # Integral approach from https://stackoverflow.com/a/37932566
    t = np.linspace(0.0, z.max(), resolution)
    integral = ((z >= t[:, None, None]) * z).sum(axis=(1, 2))
    f = scipy.interpolate.interp1d(integral, t)
    try:
        t_contours = f(np.array(percentiles) * z.sum())
        return percentiles, t_contours
    except ValueError:
        # occurrs on the low-end of percentiles (high parts of distribution)
        # maximum positions of distributions are limited by the resolution
        # at some point there's a step down to zero
        logger.debug(
            "Percentile contour below minimum for given resolution"
            "Returning Minimium."
        )
        non_one = integral[~np.isclose(integral, np.ones_like(integral))]
        return ["min"], f(np.array([np.nanmax(non_one)]))




[docs]
def plot_Z_percentiles(
    *coords,
    zi=None,
    percentiles=[0.95, 0.66, 0.33],
    ax=None,
    extent=None,
    fontsize=8,
    cmap=None,
    colors=None,
    linewidths=None,
    linestyles=None,
    contour_labels=None,
    label_contours=True,
    **kwargs,
):
    """
    Plot percentile contours onto a 2D  (scaled or unscaled) probability density
    distribution Z over X,Y.

    Parameters
    ------------
    coords : :class:`numpy.ndarray`
        Arrays of (x, y) or (a, b, c) coordinates.
    z : :class:`numpy.ndarray`
        Probability density function over x, y.
    percentiles : :class:`list`
        Percentile values for which to create contours.
    ax : :class:`matplotlib.axes.Axes`, :code:`None`
        Axes on which to plot. If none given, will create a new Axes instance.
    extent : :class:`list`, :code:`None`
        List or np.ndarray in the form [-x, +x, -y, +y] over which the image extends.
    fontsize : :class:`float`
        Fontsize for the contour labels.
    cmap : :class:`matplotlib.colors.ListedColormap`
        Color map for the contours and contour labels.
    colors : :class:`str` | :class:`list`
        Colors for the contours, can optionally be specified *in place of* `cmap.`
    linewidths : :class:`str` | :class:`list`
        Widths of contour lines.
    linestyles : :class:`str` | :class:`list`
        Styles for contour lines.
    contour_labels : :class:`dict` | :class:`list`
        Labels to assign to contours, organised by level.
    label_contours :class:`bool`
        Whether to add text labels to individual contours.

    Returns
    -------
    :class:`matplotlib.contour.QuadContourSet`
        Plotted and formatted contour set.

    Notes
    -----
    When the contours are percentile based, high percentile contours tend to get
    washed our with colormapping - consider adding different controls on coloring,
    especially where there are only one or two contours specified. One way to do
    this would be via the string based keyword argument `colors` for plt.contour, with
    an adaption for non-string colours which post-hoc modifies the contour lines
    based on the specified colours?
    """
    if ax is None:
        fig, ax = plt.subplots(1, figsize=(6, 6))

    if extent is None:
        # if len(coords) == 2:  # currently won't work for ternary
        extent = np.array([[np.min(c), np.max(c)] for c in coords[:2]]).flatten()

    clabels, contour_values = percentile_contour_values_from_meshz(
        zi, percentiles=percentiles
    )

    pcolor, contour, contourf = get_axis_density_methods(ax)
    if colors is not None:  # colors are explicitly specified
        cmap = None

    # contours will need to increase for matplotlib, so we check the ordering here.
    ordering = np.argsort(contour_values)
    # sort out multi-object properties - reorder to fit the increasing order requirement
    cntr_config = {}
    for p, v in [
        ("colors", colors),
        ("linestyles", linestyles),
        ("linewidths", linewidths),
    ]:
        if v is not None:
            if isinstance(v, (list, tuple)):
                # reorder the list
                cntr_config[p] = [v[ix] for ix in ordering]
            else:
                cntr_config[p] = v

    cs = contour(
        *coords,
        zi,
        levels=contour_values[ordering],  # must increase
        cmap=cmap,
        **{**cntr_config, **kwargs},
    )
    if label_contours:
        fs = kwargs.pop("fontsize", None) or 8
        lbls = ax.clabel(cs, fontsize=fs, inline_spacing=0)
        z_contours = sorted(list(set([float(l.get_text()) for l in lbls])))
        trans = {
            float(t): str(p)
            for t, p in zip(z_contours, sorted(percentiles, reverse=True))
        }
        if contour_labels is None:
            _labels = [trans[float(l.get_text())] for l in lbls]
        else:
            if isinstance(contour_labels, dict):
                # get the labels from the dictionary provided
                contour_labels = {str(k): str(v) for k, v in contour_labels.items()}
                _labels = [contour_labels[trans[float(l.get_text())]] for l in lbls]
            else:  # a list is specified in the same order as the contours are drawn
                _labels = contour_labels

        for l, t in zip(lbls, _labels):
            l.set_text(t)
    return cs




[docs]
def conditional_prob_density(
    y,
    x=None,
    logy=False,
    resolution=5,
    bins=50,
    yextent=None,
    rescale=True,
    mode="binkde",
    ret_centres=False,
    **kwargs,
):
    """
    Estimate the conditional probability density of one dependent variable.

    Parameters
    -----------
    y : :class:`numpy.ndarray`
        Dependent variable for which to calculate conditional probability P(y | X=x)
    x : :class:`numpy.ndarray`, :code:`None`
        Optionally-specified independent index.
    logy : :class:`bool`
        Whether to use a logarithmic bin spacing on the y axis.
    resolution : :class:`int`
        Points added per segment via interpolation along the x axis.
    bins : :class:`int`
        Bins for histograms and grids along the independent axis.
    yextent : :class:`tuple`
        Extent in the y direction.
    rescale : :class:`bool`
        Whether to rescale bins to give the same max Z across x.
    mode : :class:`str`
        Mode of computation.

            If mode is :code:`"ckde"`, use
            :func:`statsmodels.nonparametric.KDEMultivariateConditional` to compute a
            conditional kernel density estimate. If mode is :code:`"kde"`, use a normal
            gaussian kernel density estimate. If mode is :code:`"binkde"`, use a gaussian
            kernel density estimate over y for each bin. If mode is :code:`"hist"`,
            compute a histogram.
    ret_centres : :class:`bool`
        Whether to return bin centres in addtion to histogram edges,
        e.g. for later contouring.

    Returns
    -------
    :class:`tuple` of :class:`numpy.ndarray`
        :code:`x` bin edges :code:`xe`, :code:`y` bin edges :code:`ye`, histogram/density
        estimates :code:`Z`. If :code:`ret_centres` is :code:`True`, the last two return
        values will contain the bin centres :code:`xi`, :code:`yi`.
    """
    # check for shapes
    assert not ((x is None) and (y is None))
    if y is None:  # Swap the variables. Create an index for x
        y = x
        x = None

    nvar = y.shape[1]
    if x is None:  # Create a simple arange-based index
        x = np.arange(nvar)

    if resolution:  # this is where REE previously broke down
        x, y = interpolate_line(x, y, n=resolution, logy=logy)

    if not x.shape == y.shape:
        try:  # x is an index to be tiled
            assert y.shape[1] == x.shape[0]
            x = np.tile(x, y.shape[0]).reshape(*y.shape)
        except AssertionError:
            # shape mismatch
            msg = "Mismatched shapes: x: {}, y: {}. Needs either ".format(
                x.shape, y.shape
            )
            raise AssertionError(msg)

    xx = x[0]
    if yextent is None:
        ymin, ymax = np.nanmin(y), np.nanmax(y)
    else:
        ymin, ymax = np.nanmin(yextent), np.nanmax(yextent)

    # remove non finite values for kde functions
    ystep = [(ymax - ymin) / bins, (ymax / ymin) / bins][logy]
    yy = [linspc_, logspc_][logy](ymin, ymax, step=ystep, bins=bins)
    if logy:  # make grid equally spaced, evaluate in log then transform back
        y, yy = np.log(y), np.log(yy)

    xi, yi = np.meshgrid(xx, yy)
    # bin centres may be off centre, but will be in the bins.
    xe, ye = np.meshgrid(bin_centres_to_edges(xx, sort=False), bin_centres_to_edges(yy))

    kde_kw = subkwargs(kwargs, sample_kde)

    if mode == "ckde":
        fltr = np.isfinite(y.flatten()) & np.isfinite(x.flatten())
        x, y = x.flatten()[fltr], y.flatten()[fltr]
        if HAVE_SM:
            dens_c = sm.nonparametric.KDEMultivariateConditional(
                endog=[y], exog=[x], dep_type="c", indep_type="c", bw="normal_reference"
            )
        else:
            raise ImportError("Requires statsmodels.")
        # statsmodels pdf takes values in reverse order
        zi = dens_c.pdf(yi.flatten(), xi.flatten()).reshape(xi.shape)
    elif mode == "binkde":  # calclate a kde per bin
        zi = np.zeros(xi.shape)
        for bin_index in range(x.shape[1]):  # bins along the x-axis
            # if np.isfinite(y[:, bin_index]).any(): # bins can be empty
            src = y[:, bin_index]
            sample_at = yi[:, bin_index]
            zi[:, bin_index] = sample_kde(src, sample_at, **kde_kw)
            # else:
            # pass
    elif mode == "kde":  # eqivalent to 2D KDE for scatter x,y * resolution
        xkde = sample_kde(x[0], x[0])  # marginal density along x
        src = np.vstack([x.flatten(), y.flatten()]).T
        sample_at = [xi, yi]  # meshgrid logistics dealt with by sample_kde
        try:
            zi = sample_kde(src, sample_at, **kde_kw)
        except LinAlgError:  # singular matrix, try adding miniscule noise on x?
            logger.warn("Singular Matrix")
            src[:, 0] += np.random.randn(*x.shape) * np.finfo(np.float64).eps
        zi = sample_kde(src, sample_at, **kde_kw)
        zi.reshape(xi.shape)
        zi /= xkde[np.newaxis, :]
    elif "hist" in mode.lower():  # simply compute the histogram
        # histogram monotonically increasing bins, requires logbins be transformed
        # calculate histogram in logy if needed
        bins = [bin_centres_to_edges(xx), bin_centres_to_edges(yy)]
        H, xe, ye = np.histogram2d(x.flatten(), y.flatten(), bins=bins)
        zi = H.T.reshape(xi.shape)
    else:
        raise NotImplementedError

    if rescale:  # rescale bins across x
        xzfactors = np.nanmax(zi) / np.nanmax(zi, axis=0)
        zi *= xzfactors[np.newaxis, :]

    if logy:
        yi, ye = np.exp(yi), np.exp(ye)
    if ret_centres:
        return xe, ye, zi, xi, yi
    return xe, ye, zi