Answer1 / app.py
ramysaidagieb's picture
Update app.py
66289a9 verified
raw
history blame
3.58 kB
import gradio as gr
import os
import tempfile
import pdfminer.high_level
import docx2txt
import faiss
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Multilingual sentence-embedding model (covers Arabic); used to embed both
# the document chunks and the user's question into the same vector space.
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# Global in-memory vector store: FAISS index built lazily by process_files().
index = None
# Parallel store of text chunks; position i corresponds to FAISS vector i.
texts = []
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return all text extracted from the PDF file at *pdf_path*."""
    extracted = pdfminer.high_level.extract_text(pdf_path)
    return extracted
# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    """Return all text extracted from the Word document at *docx_path*."""
    extracted = docx2txt.process(docx_path)
    return extracted
# Function to process uploaded files
def process_files(files, progress=gr.Progress()):
    """Build the global FAISS retrieval index from uploaded PDF/DOCX books.

    Extracts text from each uploaded file, splits it into overlapping
    chunks, embeds the chunks, and stores them in a fresh ``IndexFlatL2``.
    Mutates the module-level ``index`` and ``texts`` globals.

    Args:
        files: Files delivered by the Gradio ``File`` component. Depending on
            the Gradio version each item is either a tempfile wrapper with a
            ``.name`` path attribute or a plain path string.
        progress: Gradio progress reporter for UI status updates.

    Returns:
        An Arabic status message describing success or why indexing failed.
    """
    global index, texts
    texts = []

    # Step 1: extract raw text from each uploaded book.
    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")
    for file in files:
        # Gradio already saves uploads to disk; read from that path directly
        # instead of re-copying (file.name is an absolute temp path, so the
        # original os.path.join(temp_dir, file.name) discarded temp_dir).
        file_path = getattr(file, "name", file)
        lowered = file_path.lower()  # case-insensitive extension matching
        try:
            if lowered.endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif lowered.endswith((".docx", ".doc")):
                # NOTE(review): docx2txt only truly supports .docx; a legacy
                # binary .doc will raise and be skipped below.
                text = extract_text_from_docx(file_path)
            else:
                continue
        except Exception:
            # Best-effort: skip files that cannot be parsed rather than
            # aborting the whole batch.
            continue
        if text and text.strip():
            texts.append(text)

    # Step 2: split the extracted documents into overlapping chunks.
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))

    # Guard: with zero chunks, embeddings.shape[1] below would raise.
    if not chunks:
        return "❗ لم يتم استخراج أي نص من الملفات المرفوعة."

    # Step 3: embed every chunk.
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    # Step 4: build the FAISS index (FAISS requires float32 vectors).
    progress(0.9, desc="بناء قاعدة بيانات البحث...")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.asarray(embeddings, dtype="float32"))

    # Replace the document texts with the chunk list so that search result
    # indices map one-to-one onto stored chunks.
    texts = chunks
    return "✅ النظام جاهز للإجابة على أسئلتك"
# Function to answer Arabic questions
def answer_question(question):
    """Answer an Arabic question by retrieving the most similar stored chunks.

    Embeds *question*, runs a nearest-neighbour search against the global
    FAISS ``index``, and returns the retrieved chunks joined by newlines.

    Args:
        question: The user's question text.

    Returns:
        The concatenated relevant chunks, or an Arabic prompt asking the
        user to upload books first when no index has been built yet.
    """
    global index, texts
    if index is None or len(texts) == 0:
        return "❗ من فضلك قم بتحميل الكتب أولاً."

    # Embed the question into the same vector space as the chunks.
    question_embedding = embedding_model.encode([question])

    # Clamp k: asking FAISS for more neighbours than stored vectors makes it
    # pad `indices` with -1, and texts[-1] would silently return the wrong
    # (last) chunk. Also filter any remaining invalid indices defensively.
    k = min(5, len(texts))
    distances, indices = index.search(
        np.asarray(question_embedding, dtype="float32"), k
    )
    retrieved_chunks = [texts[i] for i in indices[0] if 0 <= i < len(texts)]

    # Simple extractive answer: concatenate the most relevant chunks.
    return "\n".join(retrieved_chunks)
# Gradio UI
# Gradio UI: one tab to upload and index books, one tab to ask questions.
with gr.Blocks() as demo:
    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")
    with gr.Tab("رفع الكتب"):
        # Accept multiple book files; indexing starts when the button is clicked.
        upload = gr.File(file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
        train_button = gr.Button("ابدأ التدريب على الكتب")
        training_output = gr.Textbox(label="حالة التدريب")
    with gr.Tab("اسأل الكتب"):
        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")
        answer_output = gr.Textbox(label="الإجابة")
        ask_button = gr.Button("أرسل السؤال")
    # Wire the buttons to the backend functions (events must be registered
    # inside the Blocks context).
    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

# share=True exposes a public Gradio link in addition to the local server.
demo.launch(share=True)