# Spaces:
# Configuration error
# Configuration error
import re
from typing import List

import fitz  # PyMuPDF
from docx import Document
def clean_arabic_text(text: str) -> str:
    """Strip Arabic diacritics and every non-Arabic, non-whitespace character.

    Two passes are applied in order:

    1. Characters in U+064B-U+065F (the combining marks matched by the
       first pattern) are deleted.
    2. Any character that is neither whitespace nor inside one of the ranges
       U+0600-U+06FF, U+0750-U+077F, U+08A0-U+08FF, U+FB50-U+FDFF or
       U+FE70-U+FEFF is deleted. Latin letters, ASCII digits and ASCII
       punctuation are therefore dropped.

    Interior whitespace is left untouched (runs are not collapsed); only
    leading and trailing whitespace is trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, which may be an empty string.
    """
    # Pass 1: drop the combining diacritic marks.
    without_diacritics = re.sub(r'[\u064B-\u065F]', '', text)
    # Pass 2: keep only characters from the listed Arabic ranges plus whitespace.
    arabic_only = re.sub(
        r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]',
        '',
        without_diacritics,
    )
    return arabic_only.strip()
def process_pdf(file_path: str) -> List[str]:
    """Extract cleaned text from a PDF, one chunk per non-empty page.

    Each page's text is obtained with ``page.get_text()`` and passed through
    ``clean_arabic_text``. Pages whose cleaned text is empty are skipped, so
    the result may contain fewer entries than the document has pages.

    Args:
        file_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        The cleaned page texts, in page order.

    Raises:
        Whatever ``fitz.open`` raises for a missing or unreadable file; no
        exception is caught here.
    """
    chunks = []
    # The context manager closes the document even if a page fails midway;
    # the previous version never closed it.
    with fitz.open(file_path) as doc:
        for page in doc:
            cleaned = clean_arabic_text(page.get_text())
            if cleaned:
                chunks.append(cleaned)
    return chunks
def process_docx(file_path: str) -> List[str]:
    """Extract cleaned text from a Word document, one chunk per paragraph.

    Every entry of ``doc.paragraphs`` has its ``text`` passed through
    ``clean_arabic_text``. Paragraphs whose cleaned text is empty are
    skipped.

    Args:
        file_path: Path of the .docx file handed to ``Document``.

    Returns:
        The non-empty cleaned paragraph texts, in document order.

    Raises:
        Whatever ``Document`` raises for a missing or invalid file; no
        exception is caught here.
    """
    doc = Document(file_path)
    # Clean lazily, then keep only the paragraphs that still have content.
    cleaned_paragraphs = (clean_arabic_text(para.text) for para in doc.paragraphs)
    return [chunk for chunk in cleaned_paragraphs if chunk]