# rag22v2/utils.py
import fitz  # PyMuPDF
from docx import Document
import re
import pyarabic.araby as araby
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict


class DocumentProcessor:
    """Loads PDF/DOCX files, normalizes their Arabic text, and splits it into chunks."""

    def __init__(self, chunk_size=512, chunk_overlap=64):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Prefer paragraph breaks, then Arabic/Latin sentence enders, then single newlines.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "۔", ".", "؟", "!", "\n"]
        )

    def _normalize_arabic(self, text: str) -> str:
        """Strip diacritics, normalize ligatures and hamza forms, and collapse whitespace."""
        text = araby.strip_diacritics(text)
        text = araby.normalize_ligature(text)  # pyarabic's API is singular: normalize_ligature
        text = araby.normalize_hamza(text)
        # Note: collapsing whitespace removes newlines, so the splitter's "\n\n" and "\n"
        # separators never fire on normalized text; splitting falls back to sentence enders.
        return re.sub(r'\s+', ' ', text).strip()

    def _process_pdf(self, file_path: str) -> List[Dict]:
        """Extract one normalized text record per PDF page, marking bold spans."""
        doc = fitz.open(file_path)
        pages = []
        for page_num, page in enumerate(doc):
            text = ""
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if "lines" not in block:  # skip image blocks
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        if span["flags"] & 16:  # bit 4 of the span flags marks bold text
                            text += f"**{span['text']}** "
                        else:
                            text += span["text"] + " "
            pages.append({
                "text": self._normalize_arabic(text),
                "source": file_path,
                "page": page_num + 1
            })
        doc.close()
        return pages

    def _process_docx(self, file_path: str) -> List[Dict]:
        """Split a DOCX at heading paragraphs; the 1-based section index stands in for a page number."""
        doc = Document(file_path)
        sections = []
        current_section = {"text": "", "source": file_path, "page": 1}
        for para in doc.paragraphs:
            if para.style.name.startswith('Heading'):
                # A heading starts a new section; flush the previous one first.
                if current_section["text"]:
                    sections.append(current_section)
                current_section = {"text": "", "source": file_path, "page": len(sections) + 1}
                current_section["text"] += f"\n# {para.text}\n"
            else:
                current_section["text"] += para.text + "\n"
        if current_section["text"]:
            sections.append(current_section)
        return [{
            "text": self._normalize_arabic(s["text"]),
            "source": s["source"],
            "page": s["page"]
        } for s in sections]

    def process_documents(self, files: List) -> List[Dict]:
        """Chunk every supported file; `files` holds objects exposing a `.name` path (e.g. Gradio uploads)."""
        all_chunks = []
        for file_info in files:
            if file_info.name.endswith(".pdf"):
                pages = self._process_pdf(file_info.name)
            elif file_info.name.endswith(".docx"):
                pages = self._process_docx(file_info.name)
            else:
                continue  # silently skip unsupported file types
            for page in pages:
                chunks = self.text_splitter.split_text(page["text"])
                for chunk in chunks:
                    all_chunks.append({
                        "text": chunk,
                        "source": page["source"],
                        "page": page["page"]
                    })
        return all_chunks
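

# Minimal usage sketch (not part of the original module): the file paths below are
# hypothetical placeholders. In the app, process_documents receives uploaded file
# objects exposing a `.name` attribute; SimpleNamespace mimics that for local testing.
if __name__ == "__main__":
    from types import SimpleNamespace

    processor = DocumentProcessor(chunk_size=512, chunk_overlap=64)
    # Wrap plain paths so they look like upload objects with a `.name` field.
    uploads = [SimpleNamespace(name="sample.pdf"), SimpleNamespace(name="sample.docx")]
    chunks = processor.process_documents(uploads)
    for chunk in chunks[:3]:
        print(chunk["source"], chunk["page"], chunk["text"][:80])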