"""Extract plain text from PDF, DOCX and TXT files and split it into word chunks."""

import os

# The third-party parsers are imported defensively: a missing library should
# only break the file type that needs it (and be reported per file by
# ``process_documents``) instead of preventing this module from loading.
try:
    import docx
except ImportError:
    docx = None

try:
    import PyPDF2
except ImportError:
    PyPDF2 = None


def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one page per line block.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The concatenated page texts, each followed by a newline. If reading
        fails part-way, the text gathered so far is returned with an inline
        error marker appended (best-effort behaviour).

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    if PyPDF2 is None:
        raise ImportError("PyPDF2 is required to read PDF files")

    parts = []
    try:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Guard against a page yielding None instead of a string (e.g. a
            # page with no extractable text); otherwise the concatenation
            # raises and every remaining page is lost.
            parts.append((page.extract_text() or "") + "\n")
    except Exception as e:
        # Best effort: keep what was read and record the failure in the text.
        parts.append(f"\n[خطأ في قراءة PDF: {e}]\n")
    return "".join(parts)


def extract_text_from_docx(file):
    """Return the paragraphs of a DOCX document joined by newlines.

    Args:
        file: A path or binary file-like object accepted by
            ``docx.Document``.

    Raises:
        ImportError: If python-docx is not installed.
    """
    if docx is None:
        raise ImportError("python-docx is required to read DOCX files")

    document = docx.Document(file)
    return "\n".join(para.text for para in document.paragraphs)


def extract_text_from_txt(file):
    """Return the content of a text file as ``str``.

    Args:
        file: A file-like object whose ``read()`` returns bytes (decoded as
            UTF-8) or ``str`` (returned unchanged).

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    data = file.read()
    if isinstance(data, str):
        # Stream was opened in text mode; it is already decoded.
        return data
    # "utf-8-sig" decodes plain UTF-8 identically but strips a leading BOM,
    # which would otherwise stay glued to the first word.
    return data.decode("utf-8-sig")


def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Such
            values would otherwise make the window never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # The window already reached the last word; another step would
            # only emit a tail fully contained in this chunk's overlap.
            break
    return chunks


def process_documents(files, log_callback=None):
    """Extract and chunk the text of every supported file in *files*.

    Args:
        files: Iterable of file-like objects. The ``name`` attribute selects
            the parser through its extension (pdf, docx or txt,
            case-insensitive).
        log_callback: Optional callable receiving one progress message (str)
            per event.

    Returns:
        A flat list with the chunks of all successfully processed files, in
        input order. Unsupported or failing files are logged and skipped.
    """

    def log(message):
        if log_callback:
            log_callback(message)

    # Built per call so the functions are looked up at call time.
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "txt": extract_text_from_txt,
    }

    all_chunks = []
    for file in files:
        # A nameless object is treated as an unsupported format rather than
        # aborting the whole batch with AttributeError.
        filename = os.path.basename(str(getattr(file, "name", "")))
        # Without a dot there is no extension; do not mistake the whole
        # name for one.
        ext = filename.rpartition(".")[2].lower() if "." in filename else ""
        log(f"📁 معالجة الملف: {filename}")

        extractor = extractors.get(ext)
        if extractor is None:
            log(f"❗️ تنسيق غير مدعوم: {ext}")
            continue

        try:
            chunks = chunk_text(extractor(file))
        except Exception as e:
            # Per-file boundary: one bad document must not stop the batch.
            log(f"⚠️ فشل في معالجة {filename}: {e}")
            continue

        all_chunks.extend(chunks)
        log(f"✅ تم استخراج {len(chunks)} مقطع من {filename}")
    return all_chunks