Source code for pyrolite.util.skl.pipeline

from pathlib import Path

import joblib
import pandas as pd

from ..log import Handle
from ..meta import get_additional_params
from ..plot import save_figure

logger = Handle(__name__)

try:
    import sklearn.svm
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
except ImportError:
    msg = "scikit-learn not installed"
    logger.warning(msg)

try:
    from imblearn.pipeline import make_pipeline
except ImportError:
    msg = "imbalanced-learn not installed"
    logger.warning(msg)
    from sklearn.pipeline import make_pipeline  # fallback to default skl

from .vis import plot_confusion_matrix, plot_gs_results


[docs]def fit_save_classifier( clf, X_train, y_train, directory=".", name="clf", extension=".joblib" ): """ Fit and save a classifier model. Also save relevant metadata where possible. Parameters ----------- clf : :class:`sklearn.base.BaseEstimator` Classifier or gridsearch. X_train : :class:`numpy.ndarray` | :class:`pandas.DataFrame` Training data. y_train : :class:`numpy.ndarray` | :class:`pandas.Series` Training true classes. directory : :class:`str` | :class:`pathlib.Path` Path to the save directory. name : :class:`str` Name of the classifier. extension : :class:`str` Extension to give the saved classifier pickled witih joblib. Returns -------- clf : :class:`sklearn.base.BaseEstimator` Fitted classifier. """ clf_dir = Path(directory) / name if not clf_dir.exists(): clf_dir.mkdir(parents=True) clf.fit(X_train, y_train) fpath = (clf_dir / name).with_suffix(extension) # save metadata if isinstance(X_train, pd.DataFrame): # save the features used in the model for ref components = [str(i) for i in X_train.columns] with open( str(clf_dir / "{}_features.txt".format(name)), "w", encoding="utf-8" ) as fp: fp.write(",".join(components)) _ = joblib.dump(clf, str(fpath), compress=9) return clf
[docs]def classifier_performance_report(clf, X_test, y_test, classes=[], directory=".", name="clf"): """ Output a performance report for a classifier. Currently outputs the overall classification score, a confusion matrix and where relevant an indication of variation seen across the gridsearch (currently only possible for 2D searches). Parameters ---------- clf : :class:`sklearn.base.BaseEstimator` | `sklearn.model_selection.GridSearchCV` Classifer or gridsearch. X_test : :class:`numpy.ndarray` | :class:`pandas.DataFrame` Input data for testing. y_test : :class:`numpy.ndarray` | :class:`pandas.Series` Labelled/target data for testing. classes : list Names of classes. directory : :class:`str` | :class:`pathlib.Path` Path to the save directory. name : :class:`str` Name of the classifier. Returns -------- clf : :class:`sklearn.base.BaseEstimator` Fitted classifier. """ clf_dir = Path(directory) / name if not clf_dir.exists(): clf_dir.mkdir(parents=True) if isinstance(clf, GridSearchCV): gs = True gs = clf params = gs.best_params_ clf = gs.best_estimator_ score = clf.score(X_test, y_test) with open(str(clf_dir / "scores_{}.txt".format(name)), "a") as fp: line = "Score: {:01.3g}".format(score) if gs: # add the gridsearch parameters line += "\t{}\n".format( "\t".join(["{}:{:01.2g}".format(k, v) for k, v in params.items()]) ) fp.write(line) cmax = plot_confusion_matrix(clf, X_test, y_test, normalize=True, classes=classes) save_figure(cmax.figure, save_at=clf_dir, name="confusion_matrix_{}".format(name)) try: gsax = plot_gs_results(gs) save_figure( gsax.figure, save_at=clf_dir, name="gridsearchresults_{}".format(name) ) except ValueError: # only one param changed in gridsearch pass return clf
[docs]def SVC_pipeline( sampler=None, balance=True, transform=None, scaler=None, kernel="rbf", decision_function_shape="ovo", probability=False, cv=StratifiedKFold(n_splits=10, shuffle=True), param_grid={}, n_jobs=4, verbose=10, cache_size=500, **kwargs ): """ A convenience function for constructing a Support Vector Classifier pipeline. Parameters ----------- sampler : :class:`sklearn.base.TransformerMixin` Resampling transformer. balance : :class:`bool` Whether to balance the class weights for the classifier. transform : :class:`sklearn.base.TransformerMixin` Preprocessing transformer. scaler : :class:`sklearn.base.TransformerMixin` Scale transformer. kernel : :class:`str` | :class:`callable` Name of kernel to use for the support vector classifier (:code:`'linear'|'rbf'|'poly'|'sigmoid'`). Optionally, a custom kernel function can be supplied (see :mod:`sklearn` docs for more info). decision_function_shape : :class:`str`, :code:`'ovo' or 'ovr'` Shape of the decision function surface. :code:`'ovo'` one-vs-one classifier of libsvm (returning classification of shape :code:`(samples, classes*(classes-1)/2))`, or the default :code:`'ovr' one-vs-rest classifier which will return classification estimation shape of :code:`(samples, classes)`. probability : :class:`bool` Whether to implement Platt-scaling to enable probability estimates. This must be enabled prior to calling fit, and will slow down that method. cv : :class:`int` | :class:`sklearn.model_selection.BaseSearchCV` Cross validation search. If an integer :code:`k` is provided, results in default :code:`k`-fold cross validation. Optionally, if a :class:`sklearn.model_selection.BaseSearchCV` instance is provided, it will be used directly (enabling finer control, e.g. over sorting/shuffling etc). param_grid : :class:`dict` Dictionary reprenting a parameter grid for the support vector classifier. Typically contains 1D arrays of grid indicies for :func:`~sklearn.svm.SVC` parameters each prefixed with :code:`svc__` (e.g. :code:`dict(svc__gamma=np.logspace(-1, 3, 5), svc__C=np.logspace(-0.5, 2, 5))`. n_jobs : :class:`int` Number of processors to use for the SVC construction. Note that providing :code:`n_jobs = -1` will use all available processors. verbose : :class:`int` Level of verbosity for the pipeline logging output. cache_size : :class:`float` Specify the size of the kernel cache (in MB). {otherparams} Returns ------- gs : :class:`sklearn.model_selection.GridSearchCV` Gridsearch object containing the results of the SVC training across the parameter grid. Access the best estimator with :code:`gs.best_estimator_` and its parameters with :code:`gs.best_params_`. """ classifier_kwargs = { "kernel": kernel, "probability": probability, "decision_function_shape": decision_function_shape, "cache_size": cache_size, "gamma": "scale", # suppress warnings; 'auto' deprecated, likely changes with gs **kwargs, } if balance: classifier_kwargs.update(dict(class_weight="balanced")) stages = [] if sampler is not None: stages.append(sampler) if transform is not None: stages.append(transform) if scaler is not None: # scaler should be the second last item added stages.append(scaler) stages.append(sklearn.svm.SVC(**classifier_kwargs)) # add the classifier itself pipe = make_pipeline(*stages) gs = GridSearchCV( estimator=pipe, param_grid=param_grid, cv=cv, n_jobs=n_jobs, verbose=verbose ) return gs
[docs]class PdUnion(BaseEstimator, TransformerMixin): def __init__(self, estimators: list = []): self.estimators = estimators
[docs] def fit(self, X, y=None): return self
[docs] def transform(self, X): assert isinstance(X, pd.DataFrame) parts = [] for est in self.estimators: if isinstance(est, pd.DataFrame): parts.append(est) elif isinstance(est, TransformerMixin) or isinstance(est, BaseEstimator): if hasattr(est, "fit"): parts.append(est.fit_transform(X)) else: parts.append(est.transform(X)) else: # e.g. Numpy array, try to convert to dataframe parts.append(pd.DataFrame(est)) columns = [] idxs = [] for p in parts: columns += [i for i in p.columns if not i in columns] idxs.append(p.index.size) # check the indexes are all the same length assert all([idx == idxs[0] for idx in idxs]) out = pd.DataFrame(columns=columns) for p in parts: out[p.columns] = p return out
_add_additional_parameters = True SVC_pipeline.__doc__ = SVC_pipeline.__doc__.format( otherparams=[ "", get_additional_params( SVC_pipeline, sklearn.svm.SVC, indent=4, header="Other Parameters", subsections=True, ), ][_add_additional_parameters] )