Spaces:

ramysaidagieb
/

rag22V1

Configuration error

rag22V1 / document_processor.py

Upload 6 files

74e2822 verified 3 months ago

961 Bytes

	import re
	import fitz # PyMuPDF
	from docx import Document
	from typing import List

	# Arabic combining marks: honorific signs (U+0610-U+061A), harakat and
	# related marks (U+064B-U+065F), superscript alef (U+0670) and the Quranic
	# annotation marks inside U+06D6-U+06ED. The non-combining symbols in that
	# last range (U+06DD, U+06DE, U+06E5, U+06E6, U+06E9) are deliberately
	# left out of the class.
	_ARABIC_DIACRITICS_RE = re.compile(
	    r'[\u0610-\u061A\u064B-\u065F\u0670'
	    r'\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]'
	)

	# Anything that is neither whitespace nor inside one of the Arabic blocks:
	# Arabic, Arabic Supplement, Arabic Extended-A, Presentation Forms-A/B.
	_NON_ARABIC_RE = re.compile(
	    r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]'
	)


	def clean_arabic_text(text: str) -> str:
	    """Strip diacritics and every non-Arabic character from *text*.

	    Two passes are applied, then leading/trailing whitespace is trimmed:

	    1. Arabic combining marks are deleted. Besides the harakat range
	       U+064B-U+065F this also covers the superscript alef (U+0670), the
	       honorific signs and the Quranic annotation marks, which would
	       otherwise survive because they sit inside the Arabic block that
	       the second pass keeps.
	    2. Every character that is not whitespace and not in an Arabic Unicode
	       block is deleted. Latin letters, ASCII digits and ASCII punctuation
	       are therefore dropped, while Arabic-Indic digits and Arabic
	       punctuation are kept.

	    Interior whitespace is left untouched, so runs of spaces or newlines
	    may remain where characters were removed.

	    Args:
	        text: Raw text, e.g. as extracted from a PDF page or a paragraph.

	    Returns:
	        The cleaned text; an empty string when nothing Arabic remains.
	    """
	    without_marks = _ARABIC_DIACRITICS_RE.sub('', text)
	    arabic_only = _NON_ARABIC_RE.sub('', without_marks)
	    return arabic_only.strip()

	def process_pdf(file_path: str) -> List[str]:
	    """Extract cleaned text from a PDF, one chunk per page.

	    Each page's text is passed through ``clean_arabic_text``; pages that
	    are empty after cleaning are skipped. Chunks keep the page order.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The non-empty cleaned page texts.
	    """
	    # Open via a context manager so the underlying file handle is released
	    # even if extracting or cleaning a page raises.
	    with fitz.open(file_path) as doc:
	        cleaned_pages = (clean_arabic_text(page.get_text()) for page in doc)
	        # The list is built while the document is still open.
	        return [text for text in cleaned_pages if text]

	def process_docx(file_path: str) -> List[str]:
	    """Extract cleaned text from a Word document, one chunk per paragraph.

	    Every paragraph's text goes through ``clean_arabic_text``; paragraphs
	    that end up empty are left out. Chunks keep the paragraph order.

	    Args:
	        file_path: Path of the .docx file to read.

	    Returns:
	        The non-empty cleaned paragraph texts.
	    """
	    paragraphs = Document(file_path).paragraphs
	    cleaned_paragraphs = map(
	        clean_arabic_text,
	        (paragraph.text for paragraph in paragraphs),
	    )
	    # filter(None, ...) drops the falsy (empty) strings.
	    return list(filter(None, cleaned_paragraphs))