import hashlib
from pathlib import Path
import numpy as np
import pandas as pd
from .log import Handle
from .meta import subkwargs
logger = Handle(__name__)
def drop_where_all_empty(df):
    """
    Drop rows and columns which are completely empty.

    Parameters
    ----------
    df : :class:`pandas.DataFrame` | :class:`pandas.Series`
        Pandas object to drop fully-empty slices from.

    Returns
    -------
    :class:`pandas.DataFrame` | :class:`pandas.Series`
        Object with all-NaN rows (and, for dataframes, all-NaN columns) removed.
    """
    # A Series has a single axis, a DataFrame two; censor each in turn.
    for axis_index, _ in enumerate(df.axes):
        df = df.dropna(how="all", axis=axis_index)
    return df
def read_table(filepath, index_col=0, **kwargs):
    """
    Read tabular data from an excel or csv text-based file.

    Parameters
    ------------
    filepath : :class:`str` | :class:`pathlib.Path`
        Path to file.
    index_col : :class:`int`
        Column to use as the index of the output dataframe.

    Returns
    --------
    :class:`pandas.DataFrame`

    Raises
    ------
    :class:`NotImplementedError`
        Where the file extension is not one of .xls, .xlsx or .csv.
    """
    filepath = Path(filepath)
    ext = filepath.suffix.replace(".", "")
    # Previously an `assert` validated the extension first, which made the
    # `else: raise NotImplementedError` branch unreachable and disappeared
    # under `python -O`; validate via the raise instead.
    if ext in ["xls", "xlsx"]:
        reader, kw = pd.read_excel, dict(engine="openpyxl")
    elif ext in ["csv"]:
        reader, kw = pd.read_csv, {}
    else:
        raise NotImplementedError("Only .xls* and .csv currently supported.")
    # subkwargs filters the merged keyword arguments to those the reader accepts.
    df = reader(
        str(filepath), index_col=index_col, **subkwargs({**kw, **kwargs}, reader)
    )
    df = drop_where_all_empty(df)
    return df
def column_ordered_append(df1, df2, **kwargs):
    """
    Appends one dataframe to another, preserving the column order of the
    first and adding new columns on the right. Also accepts and passes on
    standard keyword arguments for :func:`pandas.concat`.

    Parameters
    ------------
    df1 : :class:`pandas.DataFrame`
        The dataframe for which columns order is preserved in the output.
    df2 : :class:`pandas.DataFrame`
        The dataframe for which new columns are appended to the output.

    Returns
    --------
    :class:`pandas.DataFrame`
    """
    # df1's columns first (in order), then df2's novel columns in their order.
    outcols = list(df1.columns) + [i for i in df2.columns if i not in df1.columns]
    return pd.concat([df1, df2], axis=0, **kwargs).reindex(columns=outcols)
def accumulate(dfs, ignore_index=False, trace_source=False, names=None):
    """
    Accumulate an iterable containing multiple :class:`pandas.DataFrame` to a single
    frame.

    Parameters
    -----------
    dfs : :class:`list`
        Sequence of dataframes.
    ignore_index : :class:`bool`
        Whether to ignore the indexes upon joining.
    trace_source : :class:`bool`
        Whether to retain a reference to the source of the data rows.
    names : :class:`list`
        Names to use in place of indexes for source names.

    Returns
    --------
    :class:`pandas.DataFrame`
        Accumulated dataframe.
    """
    acc = None
    for ix, df in enumerate(dfs):
        if trace_source:
            # Copy before tagging so we don't write the `src_idx` column onto
            # the caller's dataframe (the previous version mutated inputs).
            df = df.copy()
            df["src_idx"] = names[ix] if names else ix
        if acc is None:
            acc = df
        else:
            acc = column_ordered_append(acc, df, ignore_index=ignore_index)
    return acc
def to_frame(ser):
    """
    Simple utility for converting to :class:`pandas.DataFrame`.

    Parameters
    ----------
    ser : :class:`pandas.Series` | :class:`pandas.DataFrame`
        Pandas object to ensure is in the form of a dataframe.

    Returns
    --------
    :class:`pandas.DataFrame`
    """
    if isinstance(ser, pd.DataFrame):
        # A single-column slice is transposed to a single-row frame;
        # wider frames pass straight through.
        return ser.T if ser.columns.size == 1 else ser
    if isinstance(ser, pd.Series):
        # Promote the series to a one-row dataframe.
        return ser.to_frame().T
    raise NotImplementedError(
        "Conversion from {} to dataframe not yet implemented".format(type(ser))
    )
def to_ser(df):
    """
    Simple utility for converting single column :class:`pandas.DataFrame`
    to :class:`pandas.Series`.

    Parameters
    ----------
    df : :class:`pandas.DataFrame` | :class:`pandas.Series`
        Pandas object to ensure is in the form of a series.

    Returns
    --------
    :class:`pandas.Series`
    """
    if isinstance(df, pd.Series):
        # Already a series; nothing to convert.
        return df
    if not isinstance(df, pd.DataFrame):
        raise NotImplementedError(
            "Conversion from {} to series not yet implemented".format(type(df))
        )
    assert (df.columns.size == 1) or (
        df.index.size == 1
    ), """Can't convert DataFrame to Series:
    either columns or index need to have size 1."""
    # Squeeze along whichever axis is singular.
    return df.iloc[:, 0] if df.columns.size == 1 else df.iloc[0, :]
def to_numeric(df, errors: str = "coerce", exclude=None):
    """
    Converts non-numeric columns to numeric type where possible.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        Dataframe to convert (modified in place and also returned).
    errors : :class:`str`
        Error-handling mode passed to :func:`pandas.to_numeric`.
    exclude : :class:`list`
        Dtypes to exclude from conversion; defaults to ``["float", "int"]``.

    Returns
    -------
    :class:`pandas.DataFrame`

    Notes
    -----
    Avoid using .loc or .iloc on the LHS to make sure that data dtypes
    are propagated.
    """
    if exclude is None:
        # Default handled via sentinel to avoid a mutable default argument.
        exclude = ["float", "int"]
    cols = df.select_dtypes(exclude=exclude).columns
    df[cols] = df.loc[:, cols].apply(pd.to_numeric, errors=errors)
    return df
def zero_to_nan(df, rtol=1e-5, atol=1e-8):
    """
    Replace floats close, less or equal to zero with np.nan in a dataframe.

    Parameters
    ------------
    df : :class:`pandas.DataFrame`
        DataFrame to censor (modified in place and also returned).
    rtol : :class:`float`
        The relative tolerance parameter.
    atol : :class:`float`
        The absolute tolerance parameter.

    Returns
    --------
    :class:`pandas.DataFrame`
        Censored DataFrame.
    """
    # Bug fix: the previous column selection used `isinstance(dtype, float)`,
    # which is never true for dtype objects, so no columns were ever censored.
    # Select float-typed columns by dtype instead.
    cols = df.select_dtypes(include=["float"]).columns
    values = df[cols].values
    # Censor values within tolerance of zero ...
    values = np.where(np.isclose(values, 0.0, rtol=rtol, atol=atol), np.nan, values)
    # ... and strictly negative values.
    values = np.where(values < 0.0, np.nan, values)
    df.loc[:, cols] = values
    return df
def outliers(
    df,
    cols=None,
    detect=lambda x, quantile, qntls: (
        (x > quantile.loc[qntls[0], x.name]) & (x < quantile.loc[qntls[1], x.name])
    ),
    quantile_select=(0.02, 0.98),
    logquantile=False,
    exclude=False,
):
    """
    Select rows of a dataframe based on a quantile-band detection function.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        Dataframe to filter.
    cols : :class:`list`
        Columns to consider; defaults to all numeric columns.
    detect : :class:`callable`
        Function of (column, quantile frame, quantile keys) returning a boolean
        mask which is True where a value lies strictly within the quantile band.
    quantile_select : :class:`tuple`
        Lower and upper quantiles bounding the band.
    logquantile : :class:`bool`
        Whether to compute the quantiles on log-transformed data.
    exclude : :class:`bool`
        With the default ``False``, rows where no column falls within the band
        (i.e. the outlying rows) are returned; with ``True``, rows where at
        least one column falls within the band are returned instead.

    Returns
    -------
    :class:`pandas.DataFrame`
        Filtered subset of the numeric columns of ``df``.
    """
    if not cols:
        # `None` sentinel avoids a mutable default; empty/None both mean "all".
        cols = df.columns
    _df = df.select_dtypes(include=[np.number])
    _df = _df.loc[:, [i in cols for i in _df.columns]]
    low, high = np.min(quantile_select), np.max(quantile_select)
    if not logquantile:
        quantile = _df.quantile([low, high])
    else:
        # NOTE(review): the band is computed on log-values but compared against
        # raw values by the default `detect` — confirm this is intended.
        quantile = _df.apply(np.log).quantile([low, high])
    # True where any column of the row lies within the quantile band.
    whereout = (
        _df.apply(detect, args=(quantile, quantile_select), axis=0).sum(axis=1) > 0
    )
    if not exclude:
        whereout = np.logical_not(whereout)
    return _df.loc[whereout, :]
def concat_columns(df, columns=None, astype=str, **kwargs):
    """
    Concatenate strings across columns.

    Parameters
    -----------
    df : :class:`pandas.DataFrame`
        Dataframe to concatenate.
    columns : :class:`list`
        List of columns to concatenate.
    astype : :class:`type`
        Type to convert final concatenation to.

    Returns
    -------
    :class:`pandas.Series`
    """
    selection = df.columns if columns is None else columns
    kwargs = {**dict(dtype="object"), **kwargs}
    # Seed with an empty object series; replaced by the first column's values.
    result = pd.Series(index=df.index, **kwargs)
    for position, column in enumerate(selection):
        converted = df.loc[:, column].astype(astype)
        result = converted if position == 0 else result + converted
    return result
def uniques_from_concat(df, columns=None, hashit=True):
    """
    Creates ideally unique keys from multiple columns.
    Optionally hashes string to standardise length of identifier.

    Parameters
    ------------
    df : :class:`pandas.DataFrame`
        DataFrame to create indexes for.
    columns : :class:`list`
        Columns to use in the string concatenatation.
    hashit : :class:`bool`, :code:`True`
        Whether to use a hashing algorithm to create the key from a typically
        longer string.

    Returns
    ---------
    :class:`pandas.Series`
    """
    if columns is None:
        columns = df.columns
    # Build the per-row key strings, then encode for hashing/output.
    keys = concat_columns(df, columns, dtype="category").str.encode("UTF-8")
    if hashit:
        # Digest each key to a fixed-length md5 hex identifier.
        keys = keys.apply(lambda b: hashlib.md5(b).hexdigest())
    return keys
def df_from_csvs(csvs, dropna=True, ignore_index=False, **kwargs):
    """
    Takes a list of .csv filenames and converts to a single DataFrame.
    Combines columns across dataframes, preserving order of the first entered.

    E.g.
    SiO2, Al2O3, MgO, MnO, CaO
    SiO2, MgO, FeO, CaO
    SiO2, Na2O, Al2O3, FeO, CaO
    =>
    SiO2, Na2O, Al2O3, MgO, FeO, MnO, CaO
    - Existing neighbours take priority (i.e. FeO won't be inserted bf Al2O3)
    - Earlier inputs take priority (where ordering is ambiguous, place the earlier first)

    Parameters
    ----------
    csvs : :class:`list`
        List of filepaths of csv files to read.
    dropna : :class:`bool`
        NOTE(review): currently unused — retained for interface compatibility;
        confirm whether empty rows/columns were intended to be dropped here.
    ignore_index : :class:`bool`
        Whether to ignore indexes when accumulating the dataframes.

    Returns
    -------
    :class:`pandas.DataFrame`

    Todo
    ----
    Attempt to preserve column ordering across column sets, assuming
    they are generally in the same order but preserving only some of the
    information.
    """
    # Column ordering is handled by `accumulate`/`column_ordered_append`;
    # a manual, unused `cols` accumulation previously done here was dead code.
    dfs = [pd.read_csv(filepath, **kwargs) for filepath in csvs]
    return accumulate(dfs, ignore_index=ignore_index)