Spaces:

ramysaidagieb
/

chat22GV2

Sleeping

chat22GV2 / utils.py

Update utils.py

848b322 verified 3 months ago

1.9 kB

	import os
	import docx
	import PyPDF2

	def extract_text_from_pdf(file):
	    """Return the text of every page of a PDF, each page followed by a newline.

	    Pages are read in order through ``PyPDF2.PdfReader``. Extraction is
	    best-effort: if opening the document or reading a page raises, the text
	    collected so far is kept and a bracketed Arabic error note containing
	    the exception is appended instead of propagating the error.

	    Args:
	        file: Source passed straight to ``PyPDF2.PdfReader``.

	    Returns:
	        The accumulated page text, possibly ending with the error note.
	    """
	    pieces = []
	    try:
	        reader = PyPDF2.PdfReader(file)
	        for page in reader.pages:
	            pieces.append(page.extract_text() + "\n")
	    except Exception as err:
	        # Keep whatever was extracted before the failure and record why
	        # the remaining pages are missing.
	        pieces.append(f"\n[خطأ في قراءة PDF: {err}]\n")
	    return "".join(pieces)

	def extract_text_from_docx(file):
	    """Return the text of a Word document, one paragraph per line.

	    Args:
	        file: Source passed straight to ``docx.Document``.

	    Returns:
	        The ``text`` of each entry in the document's ``paragraphs``,
	        joined with newline characters.
	    """
	    document = docx.Document(file)
	    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
	    return "\n".join(paragraph_texts)

	def extract_text_from_txt(file):
	    """Read a plain-text file object and return its content as ``str``.

	    Args:
	        file: Object with a ``read()`` method. Binary handles (returning
	            ``bytes``) and text handles (returning ``str``) are both
	            accepted.

	    Returns:
	        The decoded text. A leading UTF-8 byte-order mark, if present in
	        binary input, is removed.

	    Raises:
	        UnicodeDecodeError: If binary content is not valid UTF-8.
	    """
	    data = file.read()
	    if isinstance(data, str):
	        # Text-mode handle: already decoded, and str has no .decode().
	        return data
	    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM,
	    # which would otherwise stay attached to the first word.
	    return data.decode("utf-8-sig")

	def chunk_text(text, chunk_size=300, overlap=50):
	    """Split text into overlapping chunks of whitespace-separated words.

	    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
	    chunk repeats the last ``overlap`` words of the one before it.
	    Chunking stops as soon as a chunk reaches the final word, so no
	    trailing chunk is emitted that is wholly contained in its predecessor.

	    Args:
	        text: Input string; split on any whitespace.
	        chunk_size: Maximum number of words per chunk. Must be positive.
	        overlap: Number of words shared between neighbouring chunks. Must
	            be smaller than ``chunk_size``.

	    Returns:
	        A list of chunks, each a single-space-joined run of words. Empty
	        when ``text`` contains no words.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
	            smaller than ``chunk_size`` (the window would never advance).
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    if overlap >= chunk_size:
	        raise ValueError("overlap must be smaller than chunk_size")

	    words = text.split()
	    step = chunk_size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        end = start + chunk_size
	        chunks.append(" ".join(words[start:end]))
	        if end >= len(words):
	            # The last word is covered; another window would only repeat
	            # the tail of this chunk.
	            break
	    return chunks

	def process_documents(files, log_callback=None):
	    """Extract and chunk the text of each supported file.

	    Each file's extension (the text after the last dot of its base name,
	    lower-cased) selects the extractor: ``pdf``, ``docx`` or ``txt``. Files
	    with any other extension are reported and skipped. A failure while
	    extracting or chunking one file is reported and does not stop the
	    remaining files from being processed.

	    Args:
	        files: Iterable of objects exposing a ``name`` attribute that holds
	            the file path; each object is also passed to the matching
	            extractor. ``None`` is treated as "no files".
	        log_callback: Optional callable receiving one progress message
	            (a string) per event. Nothing is reported when it is omitted.

	    Returns:
	        A flat list with the chunks of every successfully processed file,
	        in input order.
	    """
	    def notify(message):
	        # Single place for the optional-callback check.
	        if log_callback:
	            log_callback(message)

	    all_chunks = []
	    # "files or ()" lets a None argument behave like an empty list
	    # instead of raising TypeError on iteration.
	    for file in files or ():
	        filename = os.path.basename(file.name)
	        ext = filename.rsplit(".", 1)[-1].lower()
	        notify(f"📁 معالجة الملف: {filename}")

	        try:
	            if ext == "pdf":
	                text = extract_text_from_pdf(file)
	            elif ext == "docx":
	                text = extract_text_from_docx(file)
	            elif ext == "txt":
	                text = extract_text_from_txt(file)
	            else:
	                notify(f"❗️ تنسيق غير مدعوم: {ext}")
	                continue

	            chunks = chunk_text(text)
	            all_chunks.extend(chunks)
	            notify(f"✅ تم استخراج {len(chunks)} مقطع من {filename}")
	        except Exception as e:
	            # Per-file boundary: report the failure and move on so one bad
	            # document does not abort the whole batch.
	            notify(f"⚠️ فشل في معالجة {filename}: {e}")
	    return all_chunks