Spaces:

ramysaidagieb
/

Answer1

Sleeping

App Files Files Community

Answer1 / app.py

ramysaidagieb

Update app.py

5ef2861 verified 4 months ago

raw

history blame

4.59 kB

	import gradio as gr
	import tempfile
	import os
	import faiss
	import numpy as np
	from transformers import AutoTokenizer, AutoModel
	from sentence_transformers import SentenceTransformer
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from pdfminer.high_level import extract_text
	import docx

	# Initialize global variables
	# Module-level state shared by the Gradio callbacks defined in this file.
	# Embedding model, constructed once at import time; process_files() uses it
	# to encode the text chunks and answer_question() uses it to encode the query.
	embedding_model = SentenceTransformer('CAMeL-Lab/bert-base-arabic-camelbert-mix')
	# FAISS index over the chunk embeddings; stays None until process_files()
	# has built one.
	index = None
	# Text chunks. After a successful process_files() run, position i holds the
	# chunk whose embedding was added as row i of `index`, which is how
	# answer_question() maps a search hit back to text.
	texts = []

	def extract_text_from_pdf(file_path):
	    """Return the text of the PDF at ``file_path``, or ``""`` on failure.

	    Extraction is best-effort: any exception raised while reading the file
	    is reported on stdout and swallowed, so the caller only ever sees a
	    string.
	    """
	    content = ""
	    try:
	        content = extract_text(file_path)
	    except Exception as err:
	        # Deliberately broad: one unreadable book must not abort the run.
	        print(f"Error extracting from PDF: {err}")
	    return content

	def extract_text_from_docx(file_path):
	    """Return the paragraphs of a Word document joined by newlines.

	    The document at ``file_path`` is opened with ``docx.Document``. Any
	    exception raised while opening or reading it is reported on stdout and
	    ``""`` is returned instead.
	    """
	    try:
	        document = docx.Document(file_path)
	        joined = "\n".join(paragraph.text for paragraph in document.paragraphs)
	    except Exception as err:
	        # Deliberately broad: one unreadable book must not abort the run.
	        print(f"Error extracting from DOCX: {err}")
	        return ""
	    return joined

	def _uploaded_file_path(file):
	    """Return the on-disk path of one uploaded file.

	    Gradio hands uploads to the callback either as plain path strings or as
	    temp-file wrappers whose ``name`` attribute is that path, depending on
	    the Gradio version and the ``type`` of the ``gr.File`` component.
	    """
	    if isinstance(file, (str, os.PathLike)):
	        return os.fspath(file)
	    return os.fspath(file.name)

	def process_files(files, progress=gr.Progress()):
	    """Extract, chunk, embed and index the uploaded books.

	    Args:
	        files: The uploads received from the ``gr.File`` component (a list
	            of path strings or temp-file wrappers); may be ``None`` or empty.
	        progress: Gradio progress tracker. The ``gr.Progress()`` default is
	            what makes Gradio inject a live tracker for this callback.

	    Returns:
	        A status message (Arabic) for the training-status textbox.

	    Side effects:
	        On success, rebinds the global ``index`` to a new
	        ``faiss.IndexFlatL2`` and replaces the contents of the global
	        ``texts`` with the chunks, in the same order as the index rows.
	        On any failure both globals are left exactly as they were, so a
	        previously trained state keeps working.
	    """
	    global index, texts

	    if not files:
	        return "⚠️ لم يتم رفع أي ملفات. الرجاء رفع كتاب واحد على الأقل."

	    try:
	        # Step 1: Extract text
	        progress(0.1, desc="جاري استخراج النصوص من الكتب...")
	        documents = []
	        for file in files:
	            # Read the upload in place. Copying it to
	            # os.path.join(temp_dir, file.name) is a trap: file.name is an
	            # absolute path, so the join returns the upload's own path and
	            # opening that with "wb" truncates the file before it is read.
	            file_path = _uploaded_file_path(file)
	            extension = os.path.splitext(file_path)[1].lower()

	            if extension == ".pdf":
	                text = extract_text_from_pdf(file_path)
	            elif extension in (".docx", ".doc"):
	                # NOTE(review): legacy .doc files are routed to the DOCX
	                # reader; if it cannot parse one, the extractor returns ""
	                # and the file is skipped. Confirm .doc support is wanted.
	                text = extract_text_from_docx(file_path)
	            else:
	                continue

	            if text:
	                documents.append(text)

	        if not documents:
	            return "⚠️ لم يتم استخراج نصوص صالحة من الملفات."

	        # Step 2: Chunk the text
	        progress(0.4, desc="تقطيع النصوص إلى فقرات...")
	        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
	        chunks = []
	        for document in documents:
	            chunks.extend(splitter.split_text(document))

	        if not chunks:
	            return "⚠️ لا يوجد محتوى نصي كافٍ للتدريب."

	        # Step 3: Embed the text
	        progress(0.7, desc="تحويل الفقرات إلى متجهات...")
	        embeddings = embedding_model.encode(chunks, show_progress_bar=True)

	        # Step 4: Build FAISS index
	        progress(0.9, desc="بناء قاعدة بيانات البحث...")
	        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
	        new_index = faiss.IndexFlatL2(embeddings.shape[1])
	        new_index.add(embeddings)

	        # Publish the new state only once everything succeeded, so `index`
	        # rows and `texts` entries can never get out of step.
	        index = new_index
	        texts[:] = chunks

	        return "✅ النظام جاهز للإجابة على أسئلتك"
	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of
	        # letting the callback crash.
	        return f"❌ حدث خطأ أثناء التدريب: {str(e)}"

	def answer_question(question):
	    """Return the indexed text chunk closest to ``question``.

	    Args:
	        question: The user's question as typed into the textbox.

	    Returns:
	        The text of the single nearest chunk according to the global FAISS
	        ``index``, or a status message (Arabic) when no index has been
	        built yet, the question is empty, nothing is found, or an error
	        occurs.
	    """
	    if index is None or not texts:
	        return "⚠️ الرجاء رفع كتبك وتدريب النظام أولاً."

	    # An empty query would still be embedded and matched to some arbitrary
	    # chunk, so reject it explicitly instead of returning a bogus answer.
	    if not question or not question.strip():
	        return "⚠️ الرجاء كتابة سؤال أولاً."

	    try:
	        query = np.asarray(embedding_model.encode([question]), dtype=np.float32)

	        _, neighbours = index.search(query, k=1)
	        best = int(neighbours[0][0])
	        # The search reports -1 when it has no neighbour to return.
	        if best == -1:
	            return "❌ لم يتم العثور على إجابة."

	        # Row `best` of the index corresponds to texts[best].
	        return texts[best]
	    except Exception as e:
	        # UI boundary: show the failure in the answer box instead of
	        # letting the callback crash.
	        return f"❌ حدث خطأ أثناء الإجابة: {str(e)}"

	# User interface: upload books, train the index, then ask questions.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        "# 📚 نظام محاكاة دماغ المؤلف العربي\n"
	        "رفع كتبك ودرب النظام للإجابة على أسئلتك باللغة العربية فقط."
	    )

	    # Upload area: several PDF / Word files may be selected at once.
	    with gr.Row():
	        file_input = gr.File(
	            label="📄 ارفع ملفات الكتب (PDF أو DOCX)",
	            file_types=[".pdf", ".docx", ".doc"],
	            file_count="multiple",
	        )

	    # Button that starts indexing the uploaded files.
	    with gr.Row():
	        train_button = gr.Button("🚀 ابدأ التدريب على الكتب")

	    # Shows the status message returned by process_files().
	    output_text = gr.Textbox(label="🔵 حالة التدريب")

	    # Question on the left, retrieved answer on the right.
	    with gr.Row():
	        question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا")
	        answer_output = gr.Textbox(label="🧠 إجابة النظام")

	    # Event wiring: the button trains; pressing Enter in the question box
	    # asks the question.
	    train_button.click(
	        fn=process_files,
	        inputs=[file_input],
	        outputs=[output_text],
	    )
	    question_input.submit(
	        fn=answer_question,
	        inputs=[question_input],
	        outputs=[answer_output],
	    )

	# Start the web server for the app.
	demo.launch()