import gradio as gr
import pdfminer.high_level
import docx2txt
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load a multilingual sentence-embedding model that supports Arabic
# (it produces 384-dimensional vectors)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# FAISS index (vector store) and the text chunks it indexes
index = None
texts = []
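
# Quick sanity check of the embedding model (illustrative only; the Arabic
# string just means "hello world"):
#
#   vecs = embedding_model.encode(["مرحبا بالعالم", "hello world"])
#   print(vecs.shape)  # (2, 384)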
# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    return pdfminer.high_level.extract_text(pdf_path)

# Extract text from a DOCX file.
# Note: docx2txt only reads the .docx (Office Open XML) format,
# not legacy binary .doc files.
def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)
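
# Legacy binary .doc files would need a separate converter. A minimal sketch,
# assuming the optional textract package (not used anywhere in this app):
#
#   import textract
#
#   def extract_text_from_doc(doc_path):
#       # textract delegates binary .doc files to an external tool such as antiword
#       return textract.process(doc_path).decode("utf-8")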
# Process uploaded files: extract, chunk, embed, and index their text
def process_files(files, progress=gr.Progress()):
    global index, texts
    raw_texts = []

    # Step 1: Extract text. gr.File already saves each upload to a temp file
    # on disk, so the existing path can be read in place.
    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")  # "Extracting text from the books..."
    for file in files:
        # Gradio 3 passes temp-file objects (use .name); Gradio 4 passes path strings
        file_path = file if isinstance(file, str) else file.name
        if file_path.lower().endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif file_path.lower().endswith(".docx"):
            text = extract_text_from_docx(file_path)
        else:
            continue  # skip unsupported formats, including legacy .doc
        if text.strip():
            raw_texts.append(text)

    # Guard against uploads that produced no usable text
    if not raw_texts:
        return "❗ لم يتم استخراج أي نص من الملفات المرفوعة."  # "No text could be extracted from the uploaded files."

    # Step 2: Split the text into overlapping chunks
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")  # "Splitting the text into passages..."
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in raw_texts:
        chunks.extend(splitter.split_text(text))

    # Step 3: Embed the chunks
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")  # "Encoding the passages as vectors..."
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    # Step 4: Build the FAISS index (FAISS expects float32 vectors)
    progress(0.9, desc="بناء قاعدة بيانات البحث...")  # "Building the search database..."
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.asarray(embeddings, dtype="float32"))
    texts = chunks  # keep the chunks so search hits can be mapped back to text

    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions."
# Answer an Arabic question by retrieving the most relevant chunks
def answer_question(question):
    global index, texts
    if index is None or len(texts) == 0:
        return "❗ من فضلك قم بتحميل الكتب أولاً."  # "Please upload the books first."

    # Embed the question
    question_embedding = embedding_model.encode([question])

    # Search the FAISS index; never request more neighbours than stored chunks,
    # and drop the -1 indices FAISS returns when it cannot fill k results
    k = min(5, len(texts))
    distances, indices = index.search(np.asarray(question_embedding, dtype="float32"), k)
    retrieved_chunks = [texts[i] for i in indices[0] if i != -1]

    # Simple extractive answer: concatenate the most relevant chunks
    return "\n".join(retrieved_chunks)
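
# IndexFlatL2 ranks chunks by Euclidean distance. Cosine similarity is a common
# alternative for sentence embeddings; a sketch using FAISS's inner-product
# index over L2-normalised vectors (an assumption, not what this app does):
#
#   emb = np.asarray(embeddings, dtype="float32")
#   faiss.normalize_L2(emb)                    # normalise rows in place
#   cosine_index = faiss.IndexFlatIP(emb.shape[1])
#   cosine_index.add(emb)                      # inner product == cosine on unit vectors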
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")
    with gr.Tab("رفع الكتب"):  # "Upload the books"
        upload = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        train_button = gr.Button("ابدأ التدريب على الكتب")  # "Start training on the books"
        training_output = gr.Textbox(label="حالة التدريب")  # "Training status"
    with gr.Tab("اسأل الكتب"):  # "Ask the books"
        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
        answer_output = gr.Textbox(label="الإجابة")  # "Answer"
        ask_button = gr.Button("أرسل السؤال")  # "Send the question"

    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

# share=True is unnecessary on Hugging Face Spaces, which serve the app directly
demo.launch()
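
# For deployment as a Space, the imports above roughly map to a requirements.txt
# such as the following (package names assumed; pin versions as needed):
#
#   gradio
#   pdfminer.six
#   docx2txt
#   faiss-cpu
#   numpy
#   sentence-transformers
#   langchain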