# Spaces: Sleeping
import os

import docx
import PyPDF2
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Each page's text is followed by a newline. Extraction is best-effort: if
    an ``Exception`` is raised while opening the document or reading a page,
    the text gathered so far is kept and a bracketed error note is appended
    instead of propagating the error.

    Args:
        file: The PDF source, passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The concatenated page texts, possibly ending with the error note.
    """
    pieces = []
    try:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            pieces.append(page.extract_text() + "\n")
    except Exception as err:
        # Keep whatever was read before the failure and record the reason
        # inline, so the caller always receives a string.
        pieces.append(f"\n[خطأ في قراءة PDF: {err}]\n")
    return "".join(pieces)
def extract_text_from_docx(file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file: The document source, passed straight to ``docx.Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, joined
        with newline characters.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def extract_text_from_txt(file):
    """Read a plain-text file object and return its content as ``str``.

    Args:
        file: A file-like object exposing ``read()``. Binary content is
            decoded as UTF-8; content already read as ``str`` (a text-mode
            stream) is returned unchanged.

    Returns:
        The text content. A leading UTF-8 byte-order mark in binary input is
        dropped.

    Raises:
        UnicodeDecodeError: If binary content is not valid UTF-8.
    """
    data = file.read()
    if isinstance(data, str):
        # Text-mode stream: there is nothing left to decode.
        return data
    # "utf-8-sig" decodes like "utf-8" but strips a leading BOM, which would
    # otherwise end up as a stray "\ufeff" at the start of the text.
    return data.decode("utf-8-sig")
def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops at the first chunk that reaches the final word, so no trailing
    chunk made only of already-emitted words is produced.

    Args:
        text: The text to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``; the window could then never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already covers the last word; another window would
            # only repeat words contained in the chunk just emitted.
            break
    return chunks
def process_documents(files, log_callback=None):
    """Extract and chunk the text of every supported file in ``files``.

    The file type is chosen from the lower-cased text after the last dot of
    the file's base name: ``pdf``, ``docx`` and ``txt`` are handled, anything
    else is reported and skipped. A failure while processing one file is
    reported and does not stop the remaining files.

    Args:
        files: Iterable of file objects exposing a ``name`` attribute; each
            object is handed to the matching ``extract_text_from_*`` function.
        log_callback: Optional callable receiving one progress message per
            event. Nothing is reported when it is falsy.

    Returns:
        A single list with the chunks of all successfully processed files, in
        input order.
    """

    def notify(message):
        # Progress reporting is optional; stay silent without a callback.
        if log_callback:
            log_callback(message)

    collected = []
    for file in files:
        filename = os.path.basename(file.name)
        ext = filename.rsplit(".", 1)[-1].lower()
        notify(f"📁 معالجة الملف: {filename}")
        try:
            if ext not in ("pdf", "docx", "txt"):
                notify(f"❗️ تنسيق غير مدعوم: {ext}")
                continue
            if ext == "pdf":
                text = extract_text_from_pdf(file)
            elif ext == "docx":
                text = extract_text_from_docx(file)
            else:
                text = extract_text_from_txt(file)
            pieces = chunk_text(text)
            collected.extend(pieces)
            notify(f"✅ تم استخراج {len(pieces)} مقطع من {filename}")
        except Exception as err:
            # Best-effort batch: report the failure and move to the next file.
            notify(f"⚠️ فشل في معالجة {filename}: {err}")
    return collected