"""Helpers for extracting Arabic-only text chunks from PDF and Word files."""

import re
from typing import List

import fitz  # PyMuPDF
from docx import Document

# Arabic combining marks in the range U+064B..U+065F (tanween, short vowels,
# shadda, sukun and the marks that follow them in that range).
_DIACRITICS_RE = re.compile(r'[\u064B-\u065F]')

# Anything that is neither whitespace nor inside one of the Arabic Unicode
# blocks: Arabic, Arabic Supplement, Arabic Extended-A, and Arabic
# Presentation Forms A and B.
_NON_ARABIC_RE = re.compile(
    r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]'
)


def clean_arabic_text(text: str) -> str:
    """Strip diacritics and all non-Arabic characters from *text*.

    Characters in U+064B..U+065F are removed first, then every character
    that is neither whitespace nor in one of the Arabic Unicode blocks
    listed in ``_NON_ARABIC_RE`` is removed. This drops Latin letters,
    ASCII digits and ASCII punctuation. Interior whitespace is left
    untouched, so runs of spaces may remain where characters were removed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped;
        an empty string if nothing Arabic remains.
    """
    without_marks = _DIACRITICS_RE.sub('', text)
    arabic_only = _NON_ARABIC_RE.sub('', without_marks)
    return arabic_only.strip()


def process_pdf(file_path: str) -> List[str]:
    """Extract cleaned Arabic text from a PDF, one chunk per page.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The cleaned text of each page, in page order. Pages whose cleaned
        text is empty are skipped.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        cleaned_pages = (clean_arabic_text(page.get_text()) for page in doc)
        # The list is fully built here, before the document is closed.
        return [chunk for chunk in cleaned_pages if chunk]


def process_docx(file_path: str) -> List[str]:
    """Extract cleaned Arabic text from a Word document, one chunk per paragraph.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        The cleaned text of each paragraph, in document order. Paragraphs
        whose cleaned text is empty are skipped.
    """
    doc = Document(file_path)
    cleaned_paragraphs = (clean_arabic_text(para.text) for para in doc.paragraphs)
    return [chunk for chunk in cleaned_paragraphs if chunk]