ramysaidagieb committed on
Commit
1b4ee2d
·
verified ·
1 Parent(s): 5c518dd

Delete document_processor.py

Browse files
Files changed (1) hide show
  1. document_processor.py +0 -29
document_processor.py DELETED
@@ -1,29 +0,0 @@
1
- import re
2
- import fitz # PyMuPDF
3
- from docx import Document
4
- from typing import List
5
-
6
def clean_arabic_text(text: str) -> str:
    """Strip Arabic diacritics and every character outside the Arabic blocks.

    Two passes are applied in order:

    1. Characters in U+064B..U+065F (the diacritic marks) are deleted.
    2. Any character that is neither whitespace nor inside one of the
       ranges U+0600..U+06FF, U+0750..U+077F, U+08A0..U+08FF,
       U+FB50..U+FDFF, U+FE70..U+FEFF is deleted.

    Leading and trailing whitespace is then trimmed. Interior whitespace is
    left untouched, so removed characters may leave consecutive spaces.
    """
    diacritics = re.compile(r'[\u064B-\u065F]')
    non_arabic = re.compile(
        r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]'
    )

    # Diacritics go first: they sit inside U+0600..U+06FF and would
    # otherwise survive the second pass.
    without_marks = diacritics.sub('', text)
    arabic_only = non_arabic.sub('', without_marks)
    return arabic_only.strip()
11
-
12
def process_pdf(file_path: str) -> List[str]:
    """Extract cleaned text from a PDF, one chunk per page.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The result of ``clean_arabic_text`` for each page, in page order.
        Pages whose cleaned text is empty are skipped.
    """
    chunks = []
    # Open the document as a context manager so it is closed even when
    # text extraction raises; the handle was previously left open.
    with fitz.open(file_path) as doc:
        for page in doc:
            cleaned = clean_arabic_text(page.get_text())
            if cleaned:
                chunks.append(cleaned)
    return chunks
21
-
22
def process_docx(file_path: str) -> List[str]:
    """Extract cleaned text from a Word document, one chunk per paragraph.

    Every paragraph of the document is passed through
    ``clean_arabic_text``; paragraphs whose cleaned text is empty are
    dropped. The remaining strings are returned in document order.
    """
    document = Document(file_path)
    cleaned_paragraphs = (
        clean_arabic_text(paragraph.text) for paragraph in document.paragraphs
    )
    return [piece for piece in cleaned_paragraphs if piece]