rag22V1 / document_processor.py
ramysaidagieb's picture
Upload 6 files
74e2822 verified
raw
history blame
961 Bytes
import re
import fitz # PyMuPDF
from docx import Document
from typing import List
def clean_arabic_text(text: str) -> str:
    """Strip diacritics and every non-Arabic, non-whitespace character.

    Two removal passes are applied in order:

    1. Characters in U+064B..U+065F (Arabic combining marks) are deleted.
    2. Any character that is neither whitespace nor inside one of the
       Arabic Unicode blocks listed in the second pattern is deleted, so
       Latin letters, ASCII digits and ASCII punctuation are dropped.

    Interior whitespace is left as-is; only leading and trailing
    whitespace is trimmed from the result.
    """
    diacritics_pattern = r'[\u064B-\u065F]'
    non_arabic_pattern = r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]'

    result = text
    for pattern in (diacritics_pattern, non_arabic_pattern):
        result = re.sub(pattern, '', result)
    return result.strip()
def process_pdf(file_path: str) -> List[str]:
    """Extract cleaned text from a PDF, one chunk per page.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The cleaned text of each page, in page order. Pages whose text is
        empty after cleaning are skipped.
    """
    # Open the document as a context manager so the underlying file is
    # closed even if text extraction fails partway through.
    with fitz.open(file_path) as doc:
        cleaned_pages = [clean_arabic_text(page.get_text()) for page in doc]
    return [page_text for page_text in cleaned_pages if page_text]
def process_docx(file_path: str) -> List[str]:
    """Extract cleaned text from a Word document, one chunk per paragraph.

    Each paragraph's text is passed through ``clean_arabic_text``;
    paragraphs that end up empty are left out. Only ``doc.paragraphs`` is
    read here.
    """
    document = Document(file_path)
    cleaned_paragraphs = (
        clean_arabic_text(paragraph.text) for paragraph in document.paragraphs
    )
    return [chunk for chunk in cleaned_paragraphs if chunk]