File size: 961 Bytes
74e2822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
import fitz  # PyMuPDF
from docx import Document
from typing import List

# Combining marks that attach to Arabic letters: honorific signs
# (U+0610-U+061A), tashkeel (U+064B-U+065F), superscript alef (U+0670) and the
# Quranic annotation marks in U+06D6-U+06ED. The non-combining symbols in that
# last range (U+06DD, U+06DE, U+06E5, U+06E6, U+06E9) are deliberately skipped.
_ARABIC_DIACRITICS_RE = re.compile(
    r'[\u0610-\u061A\u064B-\u065F\u0670'
    r'\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]'
)

# Everything that is neither whitespace nor inside an Arabic Unicode block.
# Presentation Forms-B is cut off at U+FEFC so that U+FEFF (the byte-order
# mark, which str.strip() does not treat as whitespace) is removed as well.
_NON_ARABIC_RE = re.compile(
    r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC\s]'
)


def clean_arabic_text(text: str) -> str:
    """Strip diacritics and all non-Arabic characters from *text*.

    Two passes are applied in order:

    1. Arabic combining marks (tashkeel, superscript alef, honorific and
       Quranic annotation marks) are deleted.
    2. Every remaining character that is not whitespace and not in one of the
       Arabic Unicode blocks is deleted. This drops Latin letters, ASCII
       digits, ASCII punctuation and the byte-order mark.

    Internal whitespace is kept as-is; only leading and trailing whitespace
    is trimmed.

    Args:
        text: Raw text, possibly mixing Arabic with other scripts.

    Returns:
        The cleaned text, or an empty string when nothing Arabic remains.
    """
    without_marks = _ARABIC_DIACRITICS_RE.sub('', text)
    arabic_only = _NON_ARABIC_RE.sub('', without_marks)
    return arabic_only.strip()

def process_pdf(file_path: str) -> List[str]:
    """Extract cleaned text from a PDF, one chunk per page.

    Each page's text is passed through ``clean_arabic_text``; pages whose
    cleaned text is empty are skipped, so the result may have fewer entries
    than the PDF has pages.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The non-empty cleaned page texts, in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        cleaned_pages = (clean_arabic_text(page.get_text()) for page in doc)
        return [page_text for page_text in cleaned_pages if page_text]

def process_docx(file_path: str) -> List[str]:
    """Extract cleaned text from a Word document, one chunk per paragraph.

    Every paragraph exposed by ``Document.paragraphs`` is passed through
    ``clean_arabic_text``; paragraphs that come back empty are dropped.

    Args:
        file_path: Path of the .docx file to load with python-docx.

    Returns:
        The non-empty cleaned paragraph texts, in document order.
    """
    paragraphs = Document(file_path).paragraphs
    normalized = map(clean_arabic_text, (item.text for item in paragraphs))
    return [entry for entry in normalized if entry]