Spaces:
Running
Running
import re
import unicodedata
def clean_text(text: str) -> str:
    """Clean and normalize text for LLM ingestion.

    The text is NFKC-normalized, ASCII control characters are turned into
    spaces, every run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is removed.

    Args:
        text: The raw input text.

    Returns:
        The cleaned text on a single line, or ``""`` when *text* is not a
        ``str`` (e.g. ``None``).
    """
    if not isinstance(text, str):
        return ""

    # NFKC folds compatibility forms (full-width letters, ligatures,
    # non-breaking spaces, ...) into their canonical equivalents.
    text = unicodedata.normalize("NFKC", text)

    # Replace control characters with a space rather than deleting them, so
    # words separated only by a control character do not get glued together.
    text = re.sub(r"[\x00-\x1F\x7F]", " ", text)

    # Collapse any run of whitespace (including the spaces introduced above)
    # into a single space, then trim both ends.
    text = re.sub(r"\s+", " ", text)
    return text.strip()