Spaces:

ramysaidagieb
/

rag22V1

Configuration error

ramysaidagieb committed on May 22

Commit

1b4ee2d

verified ·

1 Parent(s): 5c518dd

Delete document_processor.py

Files changed (1) hide show

document_processor.py DELETED Viewed

@@ -1,29 +0,0 @@
-import re
-import fitz  # PyMuPDF
-from docx import Document
-from typing import List
-def clean_arabic_text(text: str) -> str:
-    """Strip Arabic diacritics and drop every non-Arabic, non-whitespace character.
-
-    Args:
-        text: Raw text to clean.
-
-    Returns:
-        The text with Arabic combining marks removed, restricted to the Arabic
-        Unicode blocks plus whitespace, with leading and trailing whitespace
-        stripped. May be the empty string if nothing survives the filter.
-    """
-    # Combining marks of the main Arabic block. U+064B-U+065F alone misses the
-    # superscript alef (U+0670) and the honorific/Quranic annotation marks,
-    # all of which lie inside U+0600-U+06FF and would otherwise survive the
-    # block filter below.
-    text = re.sub(
-        r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]',
-        '',
-        text,
-    )
-    # Keep only Arabic, Arabic Supplement, Arabic Extended-A and the two
-    # Arabic Presentation Forms blocks, plus whitespace.
-    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)
-    return text.strip()
-def process_pdf(file_path: str) -> List[str]:
-    """Extract cleaned Arabic text from a PDF, one chunk per non-empty page.
-
-    Args:
-        file_path: Path of the PDF file, passed to ``fitz.open``.
-
-    Returns:
-        The cleaned text of each page, in page order. Pages whose text is
-        empty after ``clean_arabic_text`` are skipped.
-    """
-    # Open via context manager so the document handle is released even when
-    # text extraction raises part-way through the file.
-    with fitz.open(file_path) as doc:
-        cleaned_pages = (clean_arabic_text(page.get_text()) for page in doc)
-        # Materialise the list before the document is closed.
-        return [cleaned for cleaned in cleaned_pages if cleaned]
-def process_docx(file_path: str) -> List[str]:
-    """Collect the cleaned text of every non-empty paragraph in a Word file.
-
-    Args:
-        file_path: Path of the .docx file, passed to ``Document``.
-
-    Returns:
-        Cleaned paragraph texts in document order; paragraphs that are empty
-        after ``clean_arabic_text`` are left out.
-    """
-    document = Document(file_path)
-    cleaned_paragraphs = (
-        clean_arabic_text(paragraph.text) for paragraph in document.paragraphs
-    )
-    return [chunk for chunk in cleaned_paragraphs if chunk]