import gradio as gr
import pdfminer.high_level
import docx2txt
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load a multilingual sentence-embedding model that supports Arabic
# (it produces 384-dimensional vectors)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# FAISS index (vector store) and the text chunks it indexes
index = None
texts = []
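
# Quick sanity check of the embedding model (illustrative only; the Arabic
# string just means "hello world"):
#
#   vecs = embedding_model.encode(["مرحبا بالعالم", "hello world"])
#   print(vecs.shape)  # (2, 384)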
# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    return pdfminer.high_level.extract_text(pdf_path)

# Extract text from a DOCX file.
# Note: docx2txt only reads the .docx (Office Open XML) format,
# not legacy binary .doc files.
def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)
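
# Legacy binary .doc files would need a separate converter. A minimal sketch,
# assuming the optional textract package (not used anywhere in this app):
#
#   import textract
#
#   def extract_text_from_doc(doc_path):
#       # textract delegates binary .doc files to an external tool such as antiword
#       return textract.process(doc_path).decode("utf-8")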
# Process uploaded files: extract, chunk, embed, and index their text
def process_files(files, progress=gr.Progress()):
    global index, texts
    raw_texts = []

    # Step 1: Extract text. gr.File already saves each upload to a temp file
    # on disk, so the existing path can be read in place.
    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")  # "Extracting text from the books..."
    for file in files:
        # Gradio 3 passes temp-file objects (use .name); Gradio 4 passes path strings
        file_path = file if isinstance(file, str) else file.name
        if file_path.lower().endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif file_path.lower().endswith(".docx"):
            text = extract_text_from_docx(file_path)
        else:
            continue  # skip unsupported formats, including legacy .doc
        if text.strip():
            raw_texts.append(text)

    # Guard against uploads that produced no usable text
    if not raw_texts:
        return "❗ لم يتم استخراج أي نص من الملفات المرفوعة."  # "No text could be extracted from the uploaded files."

    # Step 2: Split the text into overlapping chunks
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")  # "Splitting the text into passages..."
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in raw_texts:
        chunks.extend(splitter.split_text(text))

    # Step 3: Embed the chunks
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")  # "Encoding the passages as vectors..."
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    # Step 4: Build the FAISS index (FAISS expects float32 vectors)
    progress(0.9, desc="بناء قاعدة بيانات البحث...")  # "Building the search database..."
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.asarray(embeddings, dtype="float32"))
    texts = chunks  # keep the chunks so search hits can be mapped back to text

    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions."
# Answer an Arabic question by retrieving the most relevant chunks
def answer_question(question):
    global index, texts
    if index is None or len(texts) == 0:
        return "❗ من فضلك قم بتحميل الكتب أولاً."  # "Please upload the books first."

    # Embed the question
    question_embedding = embedding_model.encode([question])

    # Search the FAISS index; never request more neighbours than stored chunks,
    # and drop the -1 indices FAISS returns when it cannot fill k results
    k = min(5, len(texts))
    distances, indices = index.search(np.asarray(question_embedding, dtype="float32"), k)
    retrieved_chunks = [texts[i] for i in indices[0] if i != -1]

    # Simple extractive answer: concatenate the most relevant chunks
    return "\n".join(retrieved_chunks)
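
# IndexFlatL2 ranks chunks by Euclidean distance. Cosine similarity is a common
# alternative for sentence embeddings; a sketch using FAISS's inner-product
# index over L2-normalised vectors (an assumption, not what this app does):
#
#   emb = np.asarray(embeddings, dtype="float32")
#   faiss.normalize_L2(emb)                    # normalise rows in place
#   cosine_index = faiss.IndexFlatIP(emb.shape[1])
#   cosine_index.add(emb)                      # inner product == cosine on unit vectors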
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")
    with gr.Tab("رفع الكتب"):  # "Upload the books"
        upload = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        train_button = gr.Button("ابدأ التدريب على الكتب")  # "Start training on the books"
        training_output = gr.Textbox(label="حالة التدريب")  # "Training status"
    with gr.Tab("اسأل الكتب"):  # "Ask the books"
        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
        answer_output = gr.Textbox(label="الإجابة")  # "Answer"
        ask_button = gr.Button("أرسل السؤال")  # "Send the question"

    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

# share=True is unnecessary on Hugging Face Spaces, which serve the app directly
demo.launch()
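
# For deployment as a Space, the imports above roughly map to a requirements.txt
# such as the following (package names assumed; pin versions as needed):
#
#   gradio
#   pdfminer.six
#   docx2txt
#   faiss-cpu
#   numpy
#   sentence-transformers
#   langchain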