import re
import textwrap
from string import ascii_lowercase
import numpy as np
from .log import Handle
logger = Handle(__name__)
try:
from sortedcollections import SortedSet as set
except ImportError:
pass
def to_width(multiline_string, width=79, **kwargs):
    """
    Wrap text to a given line width using the builtin :mod:`textwrap`.

    Parameters
    ----------
    multiline_string : :class:`str`
        Text to wrap.
    width : :class:`int`
        Maximum line width passed to :func:`textwrap.fill`.
    **kwargs
        Further keyword arguments forwarded to :func:`textwrap.fill`.

    Returns
    -------
    :class:`str`
    """
    wrapped = textwrap.fill(multiline_string, width=width, **kwargs)
    return wrapped
def normalise_whitespace(strg):
    """
    Collapse each run of whitespace (tabs, newlines, repeated spaces) into a
    single space and trim whitespace from both ends.

    Parameters
    ----------
    strg : :class:`str`
        String to normalise.

    Returns
    -------
    :class:`str`
    """
    collapsed = re.sub(r"\s+", " ", strg)
    return collapsed.strip()
def remove_prefix(z, prefix):
    """
    Remove a specific prefix from the start of a string.

    The prefix is treated as literal text, so characters which are special
    in regular expressions (e.g. ``+``, ``(``, ``.``) are handled safely.

    Parameters
    ----------
    z : :class:`str`
        String to remove the prefix from.
    prefix : :class:`str`
        Prefix to remove.

    Returns
    -------
    :class:`str`
        The string without the prefix, or the unchanged string where it does
        not start with the prefix.
    """
    if prefix and z.startswith(prefix):
        # Slice rather than build a regex from the prefix: an unescaped prefix
        # would be interpreted as a pattern.
        return z[len(prefix):]
    return z
def remove_suffix(x, suffix=" "):
    """
    Remove a specific suffix from the end of a string.

    Parameters
    ----------
    x : :class:`str`
        String to remove the suffix from.
    suffix : :class:`str`
        Suffix to remove; a single space by default.

    Returns
    -------
    :class:`str`
        The string without the suffix, or the unchanged string where it does
        not end with the suffix.
    """
    # Guard against an empty suffix: every string ends with "", and
    # x[:-0] is x[:0], which would discard the entire string.
    if suffix and x.endswith(suffix):
        x = x[: -len(suffix)]
    return x
def quoted_string(s):
    """
    Wrap the formatted value of `s` in double quotation marks.

    Parameters
    ----------
    s : :class:`str`
        Value to quote.

    Returns
    -------
    :class:`str`
    """
    # Quoting is unconditional (not only for strings containing separators).
    template = '''"{}"'''
    return template.format(s)
def titlecase(
    s,
    exceptions=["and", "in", "a"],
    abbrv=["ID", "IGSN", "CIA", "CIW", "PIA", "SAR", "SiTiIndex", "WIP"],
    capitalize_first=True,
    split_on=r"[\.\s_-]+",
    delim="",
):
    """
    Formats strings in CamelCase, with exceptions for simple articles
    and for listed abbreviations, which retain their capitalization.

    Parameters
    ----------
    s : :class:`str`
        String to format.
    exceptions : iterable of :class:`str`
        Words left exactly as written when they are not the first word.
    abbrv : iterable of :class:`str`
        Abbreviations. A word already matching an entry is left as written;
        a non-first word whose upper-case form matches an entry is
        upper-cased.
    capitalize_first : :class:`bool`
        Whether to capitalize the first word (unless it is in `abbrv`).
    split_on : :class:`str`
        Regular expression used to split the string into words.
    delim : :class:`str`
        String used to join the formatted words.

    Returns
    -------
    :class:`str`

    Todo
    -----
        * Option for retaining original CamelCase.
    """
    # The defaults are only ever read, never mutated. Copying to lists lets
    # callers pass any iterable (e.g. tuples) for either argument.
    exceptions = list(exceptions)
    abbrv = list(abbrv)
    # Words which pass through untouched; built once rather than per word.
    keep_as_is = exceptions + abbrv

    first, *rest = re.split(split_on, s)
    if capitalize_first and first not in abbrv:
        first = first.capitalize()

    out = [first]
    for word in rest:
        if word in keep_as_is:
            pass
        elif word.upper() in abbrv:
            word = word.upper()
        else:
            word = word.capitalize()
        out.append(word)
    return delim.join(out)
[docs]def string_variations(
names,
preprocess=["lower", "strip"],
swaps=[(" ", "_"), (" ", "_"), ("-", " "), ("_", " "), ("-", ""), ("_", "")],
):
"""
Returns equilvaent string variations based on an input set of strings.
Parameters
----------
names: {list, str}
String or list of strings to generate name variations of.
preprocess: list
List of preprocessing string functions to apply before generating
variations.
swaps: list
List of tuples for str.replace(out, in).
Returns
--------
set
Set (or SortedSet, if sortedcontainers installed) of unique string
variations.
"""
vars = set()
# convert input to list if singular
if isinstance(names, str):
names = [names]
swapout = [s[0] for s in swaps]
for n in names:
n = str(n)
for p in preprocess:
n = getattr(n, p)()
vars.add(n)
if any([s in n for s in swapout]):
vars = vars.union([n.replace(*s) for s in swaps])
return vars
def parse_entry(
    entry,
    regex=r"(\s)*?(?P<value>[\.\w]+)(\s)*?",
    delimiter=",",
    values_only=True,
    first_only=True,
    errors=None,
    replace_nan="None",
):
    """
    Parses an arbitrary string data entry to return values based on a
    regular expression containing named fields including 'value' (and any
    others). Entries which are not strings are returned as they are, except
    that None and floating point NaN are exchanged for `replace_nan`.

    Parameters
    ----------
    entry : :class:`str`
        String entry in which to search for the regex pattern.
    regex : :class:`str`
        Regular expression to compile and match against the start of each
        part of the entry. It should define a named group 'value'.
    delimiter : :class:`str`, :code:`','`
        Optional delimiter used to split the entry into parts. Where this is
        empty or None the entry is treated as a single part.
    values_only : :class:`bool`, :code:`True`
        Option to return only values (single or list), or to instead
        return the dictionary of named groups for each part.
    first_only : :class:`bool`, :code:`True`
        Option to return only the result for the first part, or else a list
        of results for all parts.
    errors
        Error value to denote 'no match'. Not yet implemented.
    replace_nan
        Value substituted for every named group of a part which does not
        match, and for None/NaN non-string entries.

    Returns
    -------
    A single result where `first_only` is set, otherwise a list of results.
    """
    if not isinstance(entry, str):
        # Non-string entries pass straight through, minus None/NaN.
        if entry is None or (isinstance(entry, float) and np.isnan(entry)):
            entry = replace_nan
        return entry if first_only else [entry]

    pattern = re.compile(regex)
    # Named groups other than 'value', reported alongside it in each record.
    extra_groups = [name for name in pattern.groupindex if name != "value"]
    parts = entry.split(delimiter) if delimiter else [entry]

    records = []
    for part in parts:
        found = pattern.match(part)
        if found:
            record = {"value": found.group("value")}
            record.update({name: found.group(name) for name in extra_groups})
        else:
            # No match: every named group takes the placeholder value.
            record = {"value": replace_nan}
            record.update({name: replace_nan for name in extra_groups})
        records.append(record)

    if values_only:
        records = [record["value"] for record in records]
    return records[0] if first_only else records
def split_records(data, delimiter=r"\r\n"):
    """
    Splits records in a csv where quotation marks are used.

    A delimiter only separates records where the text following it, through
    to the end of the data, is made up of unquoted characters and complete
    single- or double-quoted sections; delimiters inside a quoted section
    are therefore left alone.

    Parameters
    ----------
    data : :class:`str`
        Text to split into records.
    delimiter : :class:`str`
        Regular expression for the record delimiter.

    Returns
    -------
    :class:`list`
    """
    # https://stackoverflow.com/a/2787979
    outside_quotes = """(?=(?:[^'"]|'[^']*'|"[^"]*")*$)"""
    splitter = delimiter + outside_quotes
    return re.split(splitter, data)
def slugify(value, delim="-"):
    """
    Normalizes a string: drops every character which is not a word
    character, whitespace or a hyphen, trims the ends, then replaces each run
    of whitespace and/or hyphens with the delimiter.

    Parameters
    -----------
    value : :class:`str`
        String to slugify.
    delim : :class:`str`
        Delimiter to replace whitespace (and hyphens) with.

    Returns
    -------
    :class:`str`
    """
    cleaned = re.sub(r"[^\w\s-]", "", value)
    trimmed = cleaned.strip()
    return re.sub(r"[-\s]+", delim, trimmed)
def int_to_alpha(num):
    """
    Encode an integer into alpha characters, useful for sequences of axes/figures.

    The sequence runs ``a``..``z``, then ``aa``..``zz``, then ``aaa`` and so on
    (i.e. 0 -> ``a``, 25 -> ``z``, 26 -> ``aa``, 702 -> ``aaa``).

    Parameters
    ----------
    num : :class:`int`
        Non-negative integer to encode.

    Returns
    -------
    :class:`str`
        Alpha-encoding of the integer.

    Raises
    ------
    ValueError
        If `num` is negative.
    """
    if num < 0:
        # Negative indexes would otherwise silently wrap around the alphabet.
        raise ValueError("Cannot encode a negative integer: {}".format(num))

    # Bijective base-26: there is no 'zero' digit, so shift by one before each
    # division; digits are produced least-significant first.
    remaining = num + 1
    letters = []
    while remaining > 0:
        remaining, index = divmod(remaining - 1, 26)
        letters.append(ascii_lowercase[index])
    return "".join(reversed(letters))