# app.py: Hugging Face Space that answers questions about uploaded Arabic books
# (Gradio UI + sentence-transformers embeddings + FAISS similarity search)
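#
# The Space also needs a requirements.txt next to this file. A minimal sketch
# based only on the imports used below (the usual pip package names, versions
# deliberately left unpinned):
#
#   gradio
#   faiss-cpu
#   numpy
#   sentence-transformers
#   langchain        # provides RecursiveCharacterTextSplitter in older releases;
#                    # newer ones ship it in langchain-text-splitters
#   pdfminer.six
#   python-docx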

import gradio as gr
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text
from docx import Document

# Multilingual sentence-embedding model (covers Arabic, among other languages)
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Global state shared by the handlers below: the FAISS index and its chunk texts
index = None
texts = []

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs])

def process_files(files, progress=gr.Progress()):
    global index, texts
    texts = []

    # Step 1: Extract raw text from each uploaded file.
    progress(0.1, desc="جارِ استخراج النصوص من الكتب...")  # "Extracting the text from the books..."
    for file in files:
        # Gradio already saves uploads to temp files; depending on the Gradio
        # version, `file` is either a plain path string or an object whose
        # .name attribute holds the path, so handle both.
        file_path = file if isinstance(file, str) else file.name

        if file_path.lower().endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif file_path.lower().endswith(".docx"):
            text = extract_text_from_docx(file_path)
        else:
            # Legacy .doc files are skipped: python-docx can only read .docx.
            continue

        texts.append(text)

    # Step 2: Split the extracted text into overlapping chunks.
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")  # "Splitting the text into passages..."
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
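    # chunk_size and chunk_overlap are measured in characters (the splitter's
    # default length function is len), so neighbouring ~500-character chunks
    # share about 50 characters of context.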
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))

    # Step 3: Embed each chunk with the multilingual sentence-transformer.
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")  # "Converting the passages into vectors..."
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)
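    # encode() returns an (n_chunks, embedding_dim) float array; for this MiniLM
    # model the dimension is 384, which fixes the size of the FAISS index below.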

    # Step 4: Build the FAISS index (exact, brute-force L2 search over the chunk embeddings).
    progress(0.9, desc="بناء قاعدة بيانات البحث...")  # "Building the search database..."
    embeddings = np.array(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # From here on, `texts` holds the chunks that the index rows refer to.
    texts = chunks

    return "\u2705 \u0627\u0644\u0646\u0638\u0627\u0645 \u062c\u0627\u0647\u0632 \u0644\u0644\u0625\u062c\u0627\u0628\u0629 \u0639\u0644\u0649 \u0623\u0633\u0626\u0644\u062a\u0643"

def answer_question(question):
    if index is None:
        # "Please upload books and click 'Start training' first"
        return "يرجى تحميل كتب والنقر على \"ابدأ التدريب\" أولا"

    embedded_question = embedding_model.encode([question]).astype(np.float32)
    D, I = index.search(embedded_question, k=1)
    if len(I[0]) == 0 or I[0][0] == -1:
        return "لم يتم العثور على إجابة."  # "No answer was found."

    # Return the stored chunk whose embedding is closest to the question.
    return texts[I[0][0]]
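
# Optional variant (not wired into the UI): return the k nearest chunks joined
# together, which gives the reader more surrounding context. k=3 here is an
# arbitrary illustrative choice.
def answer_question_topk(question, k=3):
    if index is None:
        return "يرجى تحميل كتب والنقر على \"ابدأ التدريب\" أولا"
    query = embedding_model.encode([question]).astype(np.float32)
    D, I = index.search(query, k=min(k, index.ntotal))
    hits = [texts[i] for i in I[0] if i != -1]
    return "\n\n".join(hits) if hits else "لم يتم العثور على إجابة."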

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# \ud83d\udcda محاكاة دماغ المؤلف بناءً على الكتب المرفوعة")

    with gr.Row():
        # Labels: "Upload the book files" / "Start training on the books"
        files = gr.File(label="ارفع ملفات الكتب", file_types=[".pdf", ".docx"], file_count="multiple")
        upload_button = gr.Button("ابدأ التدريب على الكتب")

    output_text = gr.Textbox(label="مخرجات التدريب", interactive=False)  # "Training output"

    upload_button.click(fn=process_files, inputs=[files], outputs=[output_text])

    gr.Markdown("## اطرح سؤالك بعد إكمال التدريب:")
    question = gr.Textbox(label="سؤالك بالعربية")
    answer = gr.Textbox(label="الإجابة", interactive=False)
    ask_button = gr.Button("أجب عن سؤالي")

    ask_button.click(fn=answer_question, inputs=[question], outputs=[answer])

# share=True only matters for tunnelling a local run; Spaces serves the app
# itself, so a plain launch() is enough.
demo.launch()
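
# On Hugging Face Spaces the app is started automatically from app.py; to try it
# locally, install the requirements and run `python app.py`.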