Spaces:
Configuration error
Configuration error
Delete document_processor.py
Browse files- document_processor.py +0 -29
document_processor.py
DELETED
@@ -1,29 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import fitz # PyMuPDF
|
3 |
-
from docx import Document
|
4 |
-
from typing import List
|
5 |
-
|
6 |
-
# Arabic combining marks (Unicode category Mn) inside the Arabic block:
# U+0610-061A honorific/Quranic signs, U+064B-065F harakat/tanween/shadda/sukun,
# U+0670 superscript (dagger) alef, and the Quranic annotation marks in
# U+06D6-06ED. U+06DD/06DE/06E9 (symbols) and U+06E5/06E6 (modifier letters)
# are deliberately left out because they are not combining marks.
_ARABIC_DIACRITICS_RE = re.compile(
    r'[\u0610-\u061A\u064B-\u065F\u0670'
    r'\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]'
)

# Anything that is neither whitespace nor inside one of the Arabic Unicode
# blocks (Arabic, Arabic Supplement, Arabic Extended-A, Presentation Forms A/B).
_NON_ARABIC_RE = re.compile(
    r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]'
)


def clean_arabic_text(text: str) -> str:
    """Strip diacritics and all non-Arabic characters from *text*.

    The diacritic pass runs first because those marks live inside the
    U+0600-U+06FF block and would otherwise be kept by the second filter.

    Args:
        text: Raw text, possibly mixing Arabic with other scripts,
            digits, or punctuation.

    Returns:
        The text with Arabic combining marks removed, every character
        outside the Arabic Unicode blocks (other than whitespace) removed,
        and leading/trailing whitespace stripped. Interior whitespace is
        left as-is. The result may be an empty string.
    """
    without_marks = _ARABIC_DIACRITICS_RE.sub('', text)
    arabic_only = _NON_ARABIC_RE.sub('', without_marks)
    return arabic_only.strip()
|
11 |
-
|
12 |
-
def process_pdf(file_path: str) -> List[str]:
    """Extract cleaned Arabic text from a PDF, one chunk per page.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with the cleaned text of each page, in page order. Pages
        whose text is empty after ``clean_arabic_text`` are omitted.

    Any exception raised by ``fitz.open`` or page extraction propagates
    to the caller unchanged.
    """
    chunks: List[str] = []
    # Use the document as a context manager so the underlying file handle
    # is released even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        for page in doc:
            cleaned = clean_arabic_text(page.get_text())
            if cleaned:
                chunks.append(cleaned)
    return chunks
|
21 |
-
|
22 |
-
def process_docx(file_path: str) -> List[str]:
    """Extract cleaned Arabic text from a Word document, one chunk per paragraph.

    Args:
        file_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        A list with the cleaned text of each entry in ``doc.paragraphs``,
        in document order. Paragraphs whose text is empty after
        ``clean_arabic_text`` are omitted.
    """
    document = Document(file_path)
    # Clean lazily, then keep only the paragraphs that still have content.
    cleaned_paragraphs = (
        clean_arabic_text(paragraph.text) for paragraph in document.paragraphs
    )
    return [content for content in cleaned_paragraphs if content]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|