Source code for smith_utils.text.normalization

import unicodedata


def normalize_text(text, ignore_case=True, remove_all_whitespace=False, nfkc=True):
    """
    Normalize text for comparison: Unicode normalization, whitespace
    cleanup and case folding.

    Parameters:
        text (str or None): The input text to normalize. If None, an empty
            string is returned. Any other non-string value (e.g. a numeric
            cell) is converted with ``str()`` first.
        ignore_case (bool, optional): Whether to case-fold the text so that
            strings differing only in case normalize to the same result.
            Uses ``str.casefold()``, which is more aggressive than
            ``str.lower()`` (e.g. "ß" becomes "ss"). Defaults to True.
        remove_all_whitespace (bool, optional): Whether to remove all
            whitespace. When False, leading/trailing whitespace is still
            trimmed and each internal run of whitespace is collapsed to a
            single space. Defaults to False.
        nfkc (bool, optional): Whether to apply Unicode normalization using
            NFKC form. Defaults to True.

    Returns:
        str: The normalized text.
    """
    if text is None:
        return ""

    # Cast to string to handle numeric cells safely.
    text = str(text)

    # 1. Unicode compatibility (handles full-width forms, ligatures, ...).
    if nfkc:
        text = unicodedata.normalize('NFKC', text)

    # 2. Whitespace handling. split() with no separator discards leading and
    #    trailing whitespace and splits on runs of whitespace, so a single
    #    join both trims the ends and collapses (or removes) internal runs.
    separator = "" if remove_all_whitespace else " "
    text = separator.join(text.split())

    # 3. Case folding. casefold() is the caseless-matching transform;
    #    lower() leaves pairs such as "Straße" / "STRASSE" unequal.
    if ignore_case:
        text = text.casefold()
        if nfkc:
            # Full case folding can emit decomposed sequences (e.g. U+01F0
            # folds to "j" + combining caron); re-normalize so the result
            # is still in NFKC form.
            text = unicodedata.normalize('NFKC', text)

    return text