import gradio as gr
import pdfminer.high_level
import docx2txt
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load a multilingual sentence-embedding model that covers Arabic
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# FAISS index (vector store)
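# texts[i] will hold the chunk whose embedding sits at row i of the index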
index = None
texts = []

# Function to extract text from PDF
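# (pdfminer.six handles Arabic, though quality varies with how the PDF encodes its text)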
def extract_text_from_pdf(pdf_path):
    return pdfminer.high_level.extract_text(pdf_path)

# Function to extract text from DOCX
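# (docx2txt reads .docx only; legacy .doc files are not supported)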
def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)

# Function to process uploaded files
def process_files(files, progress=gr.Progress()):
    global index, texts
    texts = []

    # Step 1: Extract text
    # Gradio saves each upload to a temporary file on disk; its path is
    # exposed as `file.name` (or as a plain string path in newer versions),
    # so the files can be read directly without copying them again.
    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")  # "Extracting text from the books..."
    for file in files:
        file_path = file.name if hasattr(file, "name") else str(file)

        if file_path.lower().endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif file_path.lower().endswith(".docx"):
            text = extract_text_from_docx(file_path)
        else:
            # Skip unsupported formats
            continue

        texts.append(text)

    # Step 2: Chunk the text
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")  # "Splitting the texts into passages..."
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))

    # Step 3: Embed the text
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")  # "Converting the passages into vectors..."
    if not chunks:
        return "❗ لم يتم استخراج أي نص من الملفات المرفوعة."  # "No text was extracted from the uploaded files."
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    # Step 4: Build FAISS index
    progress(0.9, desc="بناء قاعدة بيانات البحث...")  # "Building the search database..."
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.asarray(embeddings, dtype="float32"))
    # Keep the chunks so search results can be mapped back to their text
    texts.clear()
    texts.extend(chunks)

    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"

# Function to answer Arabic questions
def answer_question(question):
    global index, texts

    if index is None or len(texts) == 0:
        return "❗ من فضلك قم بتحميل الكتب أولاً."  # "Please upload the books first."

    # Embed the question
    question_embedding = embedding_model.encode([question])

    # Search FAISS for the nearest chunks (k capped by the index size; -1 ids are padding)
    k = min(5, index.ntotal)
    distances, indices = index.search(np.asarray(question_embedding, dtype="float32"), k)
    retrieved_chunks = [texts[i] for i in indices[0] if i != -1]

    # Simple answer: concatenate the most relevant chunks
    answer = "\n".join(retrieved_chunks)
    return answer

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")  # "Author Brain Simulation Engine"

    with gr.Tab("رفع الكتب"):  # "Upload the books"
        upload = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        train_button = gr.Button("ابدأ التدريب على الكتب")  # "Start training on the books"
        training_output = gr.Textbox(label="حالة التدريب")  # "Training status"

    with gr.Tab("اسأل الكتب"):  # "Ask the books"
        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
        answer_output = gr.Textbox(label="الإجابة")  # "The answer"
        ask_button = gr.Button("أرسل السؤال")  # "Send the question"

    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

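# share=True serves the app locally and also creates a temporary public Gradio link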
demo.launch(share=True)