import gradio as gr
import os
import tempfile
import pdfminer.high_level
import docx2txt
import faiss
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load Arabic embedding model
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
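# (This model maps sentences in 50+ languages, including Arabic, to 384-dimensional
# vectors; that dimensionality is what sizes the FAISS index built below.)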
# FAISS index (vector store)
index = None
texts = []
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    return pdfminer.high_level.extract_text(pdf_path)
# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)
# Function to process uploaded files
def process_files(files, progress=gr.Progress()):
    global index, texts
    texts = []

    # Step 1: Extract text
    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")  # "Extracting text from the books..."
    for file in files:
        # Gradio already stores each upload as a temporary file on disk,
        # so use that path directly instead of re-writing the upload ourselves
        file_path = file.name
        if file_path.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif file_path.endswith(".docx") or file_path.endswith(".doc"):
            # Note: docx2txt only parses .docx archives; legacy binary .doc files may not extract
            text = extract_text_from_docx(file_path)
        else:
            continue
        texts.append(text)
    # Step 2: Chunk the text
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")  # "Splitting the text into passages..."
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))

    # Step 3: Embed the chunks
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")  # "Converting the passages into vectors..."
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    # Step 4: Build FAISS index
    progress(0.9, desc="بناء قاعدة بيانات البحث...")  # "Building the search database..."
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))

    # Keep the chunks as the retrieval corpus
    texts.clear()
    texts.extend(chunks)

    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"
# Function to answer Arabic questions
def answer_question(question):
    global index, texts
    if index is None or len(texts) == 0:
        return "❗ من فضلك قم بتحميل الكتب أولاً."  # "Please upload the books first."

    # Embed the question
    question_embedding = embedding_model.encode([question])

    # Search in FAISS for the 5 nearest chunks (FAISS returns -1 for missing neighbours
    # when the index holds fewer than k vectors, so filter those out)
    distances, indices = index.search(np.array(question_embedding), k=5)
    retrieved_chunks = [texts[i] for i in indices[0] if i != -1]

    # Simple answer: concatenate the most relevant chunks
    answer = "\n".join(retrieved_chunks)
    return answer
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")  # "Author Brain Simulation Engine"

    with gr.Tab("رفع الكتب"):  # "Upload books" tab
        upload = gr.File(file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
        train_button = gr.Button("ابدأ التدريب على الكتب")  # "Start training on the books"
        training_output = gr.Textbox(label="حالة التدريب")  # "Training status"

    with gr.Tab("اسأل الكتب"):  # "Ask the books" tab
        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
        answer_output = gr.Textbox(label="الإجابة")  # "Answer"
        ask_button = gr.Button("أرسل السؤال")  # "Send the question"

    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])
demo.launch(share=True)
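
# Likely runtime dependencies for this Space, inferred from the imports above (assumed,
# not pinned anywhere in this file): gradio, sentence-transformers, faiss-cpu, numpy,
# pdfminer.six, docx2txt, langchain, tqdm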