Source code for pyrolite.util.text

import re
import textwrap
from string import ascii_lowercase

import numpy as np

from .log import Handle

logger = Handle(__name__)

try:
    from sortedcollections import SortedSet as set
except ImportError:
    pass


[docs]def to_width(multiline_string, width=79, **kwargs): """Uses builtin textwapr for text wrapping to a specific width.""" return textwrap.fill(multiline_string, width, **kwargs)
[docs]def normalise_whitespace(strg): """Substitutes extra tabs, newlines etc. for a single space.""" return re.sub(r"\s+", " ", strg).strip()
[docs]def remove_prefix(z, prefix): """Remove a specific prefix from the start of a string.""" if z.startswith(prefix): return re.sub(r"^{}".format(prefix), "", z) else: return z
[docs]def remove_suffix(x, suffix=" "): """ Remove a specific suffix from the end of a string. """ if x.endswith(suffix): x = x[: -len(suffix)] return x
[docs]def quoted_string(s): # if " " in s or '-' in s or '_' in s: s = '''"{}"'''.format(s) return s
[docs]def titlecase( s, exceptions=["and", "in", "a"], abbrv=["ID", "IGSN", "CIA", "CIW", "PIA", "SAR", "SiTiIndex", "WIP"], capitalize_first=True, split_on=r"[\.\s_-]+", delim="", ): """ Formats strings in CamelCase, with exceptions for simple articles and omitted abbreviations which retain their capitalization. Todo ----- * Option for retaining original CamelCase. """ # Check if abbrv in string, in which case it'll need to be split first? words = re.split(split_on, s) out = [] first = words[0] if capitalize_first and not (first in abbrv): first = first.capitalize() out.append(first) for word in words[1:]: if word in exceptions + abbrv: pass elif word.upper() in abbrv: word = word.upper() else: word = word.capitalize() out.append(word) return delim.join(out)
[docs]def string_variations( names, preprocess=["lower", "strip"], swaps=[(" ", "_"), (" ", "_"), ("-", " "), ("_", " "), ("-", ""), ("_", "")], ): """ Returns equilvaent string variations based on an input set of strings. Parameters ---------- names: {list, str} String or list of strings to generate name variations of. preprocess: list List of preprocessing string functions to apply before generating variations. swaps: list List of tuples for str.replace(out, in). Returns -------- set Set (or SortedSet, if sortedcontainers installed) of unique string variations. """ vars = set() # convert input to list if singular if isinstance(names, str): names = [names] swapout = [s[0] for s in swaps] for n in names: n = str(n) for p in preprocess: n = getattr(n, p)() vars.add(n) if any([s in n for s in swapout]): vars = vars.union([n.replace(*s) for s in swaps]) return vars
[docs]def parse_entry( entry, regex=r"(\s)*?(?P<value>[\.\w]+)(\s)*?", delimiter=",", values_only=True, first_only=True, errors=None, replace_nan="None", ): """ Parses an arbitrary string data entry to return values based on a regular expression containing named fields including 'value' (and any others). If the entry is of non-string type, this will return the value (e.g. int, float, NaN, None). Parameters ----------------------- entry : :class:`str` String entry which to search for the regex pattern. regex : :class:`str` Regular expression to compile and use to search the entry for a value. delimiter : :class:`str`, ::code:`','` Optional delimiter to split the string in case of multiple inclusion. values_only : :class:`bool`, :code:`True` Option to return only values (single or list), or to instead return the dictionary corresponding to the matches. first_only : :class:`bool`, :code:`True` Option to return only the first match, or else all matches errors Error value to denote 'no match'. Not yet implemented. """ if isinstance(entry, str): pattern = re.compile(regex) matches = [] if not delimiter or (delimiter is None): subparts = [entry] else: subparts = entry.split(delimiter) for _l in subparts: _m = pattern.match(_l) if _m: _d = dict(value=_m.group("value")) # Add other groups _d.update( { k: _m.group(k) for (k, ind) in pattern.groupindex.items() if not k == "value" } ) else: _d = dict(value=replace_nan) # Add other groups _d.update( { k: replace_nan for (k, ind) in pattern.groupindex.items() if not k == "value" } ) matches.append(_d) if values_only: matches = [m["value"] for m in matches] if first_only: return matches[0] return matches else: if entry is None: entry = replace_nan elif isinstance(entry, float): if np.isnan(entry): entry = replace_nan if first_only: return entry else: return [entry]
[docs]def split_records(data, delimiter=r"\r\n"): """ Splits records in a csv where quotation marks are used. Splits on a delimiter followed by an even number of quotation marks. """ # https://stackoverflow.com/a/2787979 return re.split(delimiter + """(?=(?:[^'"]|'[^']*'|"[^"]*")*$)""", data)
[docs]def slugify(value, delim="-"): """ Normalizes a string, removes non-alpha characters, converts spaces to delimiters. Parameters ----------- value : :class:`str` String to slugify. delim : :class:`str` Delimiter to replace whitespace with. Returns ------- :class:`str` """ value = re.sub(r"[^\w\s-]", "", value).strip() value = re.sub(r"[-\s]+", delim, value) return value
[docs]def int_to_alpha(num): """ Encode an integer into alpha characters, useful for sequences of axes/figures. Parameters ---------- int : :class:`int` Integer to encode. Returns ------- :class:`str` Alpha-encoding of a small integer. """ remainder = num text = [] if num >= 26: major = remainder // 26 text.append(ascii_lowercase[remainder // 26 - 1]) remainder -= major * 26 text.append(ascii_lowercase[remainder]) return "".join(text)