"""
Functions for dealing with kernel density estimation.
Attributes
----------
USE_PCOLOR : :class:`bool`
Option to use the :func:`matplotlib.pyplot.pcolor` function in place
of :func:`matplotlib.pyplot.pcolormesh`.
"""
import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
from numpy.linalg import LinAlgError
from ..distributions import sample_kde
from ..log import Handle
from ..math import interpolate_line, linspc_, logspc_
from ..meta import subkwargs
from .grid import bin_centres_to_edges
logger = Handle(__name__)
try:
import statsmodels.api as sm
HAVE_SM = True
except ImportError:
HAVE_SM = False
USE_PCOLOR = False
[docs]def get_axis_density_methods(ax):
"""
Get the relevant density and contouring methods for a given axis.
Parameters
-----------
ax : :class:`matplotlib.axes.Axes` | :class:`mpltern.ternary.TernaryAxes`
Axis to check.
Returns
--------
pcolor, contour, contourf
Relevant functions for this axis.
"""
if ax.name == "ternary":
pcolor = ax.tripcolor
contour = ax.tricontour
contourf = ax.tricontourf
else:
if USE_PCOLOR:
pcolor = ax.pcolor
else:
pcolor = ax.pcolormesh
contour = ax.contour
contourf = ax.contourf
return pcolor, contour, contourf
[docs]def percentile_contour_values_from_meshz(
z, percentiles=[0.95, 0.66, 0.33], resolution=1000
):
"""
Integrate a probability density distribution Z(X,Y) to obtain contours in Z which
correspond to specified percentile contours. Contour values will be returned
with the same order as the inputs.
Parameters
----------
z : :class:`numpy.ndarray`
Probability density function over x, y.
percentiles : :class:`numpy.ndarray`
Percentile values for which to create contours.
resolution : :class:`int`
Number of bins for thresholds between 0. and max(Z)
Returns
-------
labels : :class:`list`
Labels for contours (percentiles, if above minimum z value).
contours : :class:`list`
Contour height values.
Todo
-----
This may error for a list of percentiles where one or more requested
values are below the miniumum threshold. The exception handling should
be updated to cater for arrays - where some of the values may be above
the minimum.
"""
# Integral approach from https://stackoverflow.com/a/37932566
t = np.linspace(0.0, z.max(), resolution)
integral = ((z >= t[:, None, None]) * z).sum(axis=(1, 2))
f = scipy.interpolate.interp1d(integral, t)
try:
t_contours = f(np.array(percentiles) * z.sum())
return percentiles, t_contours
except ValueError:
# occurrs on the low-end of percentiles (high parts of distribution)
# maximum positions of distributions are limited by the resolution
# at some point there's a step down to zero
logger.debug(
"Percentile contour below minimum for given resolution"
"Returning Minimium."
)
non_one = integral[~np.isclose(integral, np.ones_like(integral))]
return ["min"], f(np.array([np.nanmax(non_one)]))
[docs]def plot_Z_percentiles(
*coords,
zi=None,
percentiles=[0.95, 0.66, 0.33],
ax=None,
extent=None,
fontsize=8,
cmap=None,
colors=None,
linewidths=None,
linestyles=None,
contour_labels=None,
label_contours=True,
**kwargs
):
"""
Plot percentile contours onto a 2D (scaled or unscaled) probability density
distribution Z over X,Y.
Parameters
------------
coords : :class:`numpy.ndarray`
Arrays of (x, y) or (a, b, c) coordinates.
z : :class:`numpy.ndarray`
Probability density function over x, y.
percentiles : :class:`list`
Percentile values for which to create contours.
ax : :class:`matplotlib.axes.Axes`, :code:`None`
Axes on which to plot. If none given, will create a new Axes instance.
extent : :class:`list`, :code:`None`
List or np.ndarray in the form [-x, +x, -y, +y] over which the image extends.
fontsize : :class:`float`
Fontsize for the contour labels.
cmap : :class:`matplotlib.colors.ListedColormap`
Color map for the contours and contour labels.
colors : :class:`str` | :class:`list`
Colors for the contours, can optionally be specified *in place of* `cmap.`
linewidths : :class:`str` | :class:`list`
Widths of contour lines.
linestyles : :class:`str` | :class:`list`
Styles for contour lines.
contour_labels : :class:`dict` | :class:`list`
Labels to assign to contours, organised by level.
label_contours :class:`bool`
Whether to add text labels to individual contours.
Returns
-------
:class:`matplotlib.contour.QuadContourSet`
Plotted and formatted contour set.
Notes
-----
When the contours are percentile based, high percentile contours tend to get
washed our with colormapping - consider adding different controls on coloring,
especially where there are only one or two contours specified. One way to do
this would be via the string based keyword argument `colors` for plt.contour, with
an adaption for non-string colours which post-hoc modifies the contour lines
based on the specified colours?
"""
if ax is None:
fig, ax = plt.subplots(1, figsize=(6, 6))
if extent is None:
# if len(coords) == 2: # currently won't work for ternary
extent = np.array([[np.min(c), np.max(c)] for c in coords[:2]]).flatten()
clabels, contour_values = percentile_contour_values_from_meshz(
zi, percentiles=percentiles
)
pcolor, contour, contourf = get_axis_density_methods(ax)
if colors is not None: # colors are explicitly specified
cmap = None
# contours will need to increase for matplotlib, so we check the ordering here.
ordering = np.argsort(contour_values)
# sort out multi-object properties - reorder to fit the increasing order requirement
cntr_config = {}
for p, v in [
("colors", colors),
("linestyles", linestyles),
("linewidths", linewidths),
]:
if v is not None:
if isinstance(v, (list, tuple)):
# reorder the list
cntr_config[p] = [v[ix] for ix in ordering]
else:
cntr_config[p] = v
cs = contour(
*coords,
zi,
levels=contour_values[ordering], # must increase
cmap=cmap,
**{**cntr_config, **kwargs}
)
if label_contours:
fs = kwargs.pop("fontsize", None) or 8
lbls = ax.clabel(cs, fontsize=fs, inline_spacing=0)
z_contours = sorted(list(set([float(l.get_text()) for l in lbls])))
trans = {
float(t): str(p)
for t, p in zip(z_contours, sorted(percentiles, reverse=True))
}
if contour_labels is None:
_labels = [trans[float(l.get_text())] for l in lbls]
else:
if isinstance(contour_labels, dict):
# get the labels from the dictionary provided
contour_labels = {str(k): str(v) for k, v in contour_labels.items()}
_labels = [contour_labels[trans[float(l.get_text())]] for l in lbls]
else: # a list is specified in the same order as the contours are drawn
_labels = contour_labels
for l, t in zip(lbls, _labels):
l.set_text(t)
return cs
[docs]def conditional_prob_density(
y,
x=None,
logy=False,
resolution=5,
bins=50,
yextent=None,
rescale=True,
mode="binkde",
ret_centres=False,
**kwargs
):
"""
Estimate the conditional probability density of one dependent variable.
Parameters
-----------
y : :class:`numpy.ndarray`
Dependent variable for which to calculate conditional probability P(y | X=x)
x : :class:`numpy.ndarray`, :code:`None`
Optionally-specified independent index.
logy : :class:`bool`
Whether to use a logarithmic bin spacing on the y axis.
resolution : :class:`int`
Points added per segment via interpolation along the x axis.
bins : :class:`int`
Bins for histograms and grids along the independent axis.
yextent : :class:`tuple`
Extent in the y direction.
rescale : :class:`bool`
Whether to rescale bins to give the same max Z across x.
mode : :class:`str`
Mode of computation.
If mode is :code:`"ckde"`, use
:func:`statsmodels.nonparametric.KDEMultivariateConditional` to compute a
conditional kernel density estimate. If mode is :code:`"kde"`, use a normal
gaussian kernel density estimate. If mode is :code:`"binkde"`, use a gaussian
kernel density estimate over y for each bin. If mode is :code:`"hist"`,
compute a histogram.
ret_centres : :class:`bool`
Whether to return bin centres in addtion to histogram edges,
e.g. for later contouring.
Returns
-------
:class:`tuple` of :class:`numpy.ndarray`
:code:`x` bin edges :code:`xe`, :code:`y` bin edges :code:`ye`, histogram/density
estimates :code:`Z`. If :code:`ret_centres` is :code:`True`, the last two return
values will contain the bin centres :code:`xi`, :code:`yi`.
"""
# check for shapes
assert not ((x is None) and (y is None))
if y is None: # Swap the variables. Create an index for x
y = x
x = None
nvar = y.shape[1]
if x is None: # Create a simple arange-based index
x = np.arange(nvar)
if resolution: # this is where REE previously broke down
x, y = interpolate_line(x, y, n=resolution, logy=logy)
if not x.shape == y.shape:
try: # x is an index to be tiled
assert y.shape[1] == x.shape[0]
x = np.tile(x, y.shape[0]).reshape(*y.shape)
except AssertionError:
# shape mismatch
msg = "Mismatched shapes: x: {}, y: {}. Needs either ".format(
x.shape, y.shape
)
raise AssertionError(msg)
xx = x[0]
if yextent is None:
ymin, ymax = np.nanmin(y), np.nanmax(y)
else:
ymin, ymax = np.nanmin(yextent), np.nanmax(yextent)
# remove non finite values for kde functions
ystep = [(ymax - ymin) / bins, (ymax / ymin) / bins][logy]
yy = [linspc_, logspc_][logy](ymin, ymax, step=ystep, bins=bins)
if logy: # make grid equally spaced, evaluate in log then transform back
y, yy = np.log(y), np.log(yy)
xi, yi = np.meshgrid(xx, yy)
# bin centres may be off centre, but will be in the bins.
xe, ye = np.meshgrid(bin_centres_to_edges(xx, sort=False), bin_centres_to_edges(yy))
kde_kw = subkwargs(kwargs, sample_kde)
if mode == "ckde":
fltr = np.isfinite(y.flatten()) & np.isfinite(x.flatten())
x, y = x.flatten()[fltr], y.flatten()[fltr]
if HAVE_SM:
dens_c = sm.nonparametric.KDEMultivariateConditional(
endog=[y], exog=[x], dep_type="c", indep_type="c", bw="normal_reference"
)
else:
raise ImportError("Requires statsmodels.")
# statsmodels pdf takes values in reverse order
zi = dens_c.pdf(yi.flatten(), xi.flatten()).reshape(xi.shape)
elif mode == "binkde": # calclate a kde per bin
zi = np.zeros(xi.shape)
for bin_index in range(x.shape[1]): # bins along the x-axis
# if np.isfinite(y[:, bin_index]).any(): # bins can be empty
src = y[:, bin_index]
sample_at = yi[:, bin_index]
zi[:, bin_index] = sample_kde(src, sample_at, **kde_kw)
# else:
# pass
elif mode == "kde": # eqivalent to 2D KDE for scatter x,y * resolution
xkde = sample_kde(x[0], x[0]) # marginal density along x
src = np.vstack([x.flatten(), y.flatten()]).T
sample_at = [xi, yi] # meshgrid logistics dealt with by sample_kde
try:
zi = sample_kde(src, sample_at, **kde_kw)
except LinAlgError: # singular matrix, try adding miniscule noise on x?
logger.warn("Singular Matrix")
src[:, 0] += np.random.randn(*x.shape) * np.finfo(np.float64).eps
zi = sample_kde(src, sample_at, **kde_kw)
zi.reshape(xi.shape)
zi /= xkde[np.newaxis, :]
elif "hist" in mode.lower(): # simply compute the histogram
# histogram monotonically increasing bins, requires logbins be transformed
# calculate histogram in logy if needed
bins = [bin_centres_to_edges(xx), bin_centres_to_edges(yy)]
H, xe, ye = np.histogram2d(x.flatten(), y.flatten(), bins=bins)
zi = H.T.reshape(xi.shape)
else:
raise NotImplementedError
if rescale: # rescale bins across x
xzfactors = np.nanmax(zi) / np.nanmax(zi, axis=0)
zi *= xzfactors[np.newaxis, :]
if logy:
yi, ye = np.exp(yi), np.exp(ye)
if ret_centres:
return xe, ye, zi, xi, yi
return xe, ye, zi