# Source code for smith_utils.text.normalization
import unicodedata
[docs]
def normalize_text(text, ignore_case=True, remove_all_whitespace=False, nfkc=True) -> str:
    """
    Normalize text for comparison: Unicode normalization, whitespace cleanup, case folding.

    Parameters:
        text (object or None): The value to normalize. Non-string values are converted
            with ``str()``. If None, an empty string is returned.
        ignore_case (bool, optional): Whether to case-fold the text with ``str.casefold``
            so that strings differing only in case normalize to the same result (for
            example "Straße" and "STRASSE"). Defaults to True.
        remove_all_whitespace (bool, optional): If True, remove every whitespace
            character. If False, trim outer whitespace and collapse each internal run
            of whitespace to a single space. Defaults to False.
        nfkc (bool, optional): Whether to apply Unicode normalization using NFKC form.
            Defaults to True.

    Returns:
        str: The normalized text.
    """
    if text is None:
        return ""

    # Cast to string to handle numeric cells safely
    text = str(text)

    # 1. Unicode compatibility (handles full-width forms / ligatures)
    if nfkc:
        text = unicodedata.normalize("NFKC", text)

    # 2. Whitespace handling
    # split() with no arguments discards leading/trailing whitespace and splits on
    # runs of whitespace, so joining the pieces both trims and collapses (or, with an
    # empty separator, removes) whitespace in one step.
    separator = "" if remove_all_whitespace else " "
    text = separator.join(text.split())

    # 3. Case folding
    # casefold() rather than lower(): lower() leaves characters such as "ß" untouched,
    # so case variants like "Straße" / "STRASSE" would not compare equal.
    if ignore_case:
        text = text.casefold()

    return text