Allanatrix's picture
Upload 50 files
ef4c8c3 verified
raw
history blame contribute delete
514 Bytes
import re
import unicodedata
def clean_text(text: str) -> str:
    """Clean and normalize text for LLM ingestion.

    The text is NFKC-normalized, every control character is replaced by a
    space, all runs of whitespace are collapsed to a single space, and
    leading/trailing whitespace is stripped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-line string. An empty string is returned when
        ``text`` is not a ``str`` (e.g. ``None``), so callers never have to
        handle a non-string result.
    """
    if not isinstance(text, str):
        return ""

    # NFKC folds compatibility forms (ligatures, full-width letters,
    # non-breaking spaces, ...) into their canonical equivalents.
    text = unicodedata.normalize("NFKC", text)

    # Replace control characters with a space rather than deleting them, so
    # that words separated only by a control character do not get glued
    # together. The class covers the whole Unicode "Cc" category: the C0
    # range U+0000-U+001F, DEL U+007F, and the C1 range U+0080-U+009F.
    text = re.sub(r"[\x00-\x1F\x7F-\x9F]", " ", text)

    # Collapse every run of whitespace (including the spaces introduced
    # above and any newlines/tabs) into one space.
    text = re.sub(r"\s+", " ", text)

    return text.strip()