# File size: 4,256 Bytes
# d28c712
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import gradio as gr
import os
import tempfile
import faiss
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from pdfminer.high_level import extract_text as extract_pdf_text
import docx
import nltk

# Download the NLTK 'punkt' resource every time this module is imported.
# NOTE(review): arabic_split_text calls sent_tokenize(..., language='arabic');
# confirm that the downloaded Punkt data actually contains an Arabic model,
# otherwise that call raises LookupError at runtime.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Module-level state shared by the Gradio callbacks below.
# uploaded_texts: extracted text of each uploaded book (appended to by upload_book).
uploaded_texts = []
# vector_store / qa_chain: assigned by train_from_texts; qa_chain stays None
# until a book has been indexed (answer_question checks for that).
vector_store = None
qa_chain = None

# Embedding model used to vectorize the text chunks.
embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Seq2seq model and tokenizer used to generate answers.
# NOTE(review): presumably both models are fetched from the Hugging Face Hub on
# first run - confirm the identifiers resolve in the deployment environment.
model_name = "csebuetnlp/mT5_small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Wrap the model in a text2text-generation pipeline (max_length=512) and expose
# it to LangChain through HuggingFacePipeline.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)

# Arabic instruction block wrapped around every user question; the only
# placeholder is {question}.
ARABIC_PROMPT_TEMPLATE = """
أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
لا تستخدم أي معلومات خارجية.
السؤال: {question}
الإجابة:
"""


def format_arabic_prompt(question):
    """Return ARABIC_PROMPT_TEMPLATE with ``question`` substituted in.

    The question text is inserted verbatim; braces inside it are not
    interpreted as format fields.
    """
    substitutions = {"question": question}
    return ARABIC_PROMPT_TEMPLATE.format_map(substitutions)

def extract_text_from_file(file_path):
    """Extract the plain text of a PDF or Word document.

    Args:
        file_path: Path to the document, as a string or ``os.PathLike``.
            The format is chosen from the file extension, compared
            case-insensitively (``.PDF`` is treated like ``.pdf``).

    Returns:
        The extracted text. For Word documents this is the paragraph texts
        joined with newlines.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
    """
    path = os.fspath(file_path)
    # Compare on a lowered copy so upper/mixed-case extensions are accepted,
    # while the original path is what gets opened.
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        return extract_pdf_text(path)
    if lowered.endswith((".docx", ".doc")):
        # NOTE(review): python-docx is documented for .docx (OOXML) files;
        # a legacy binary .doc will probably fail inside docx.Document - confirm.
        document = docx.Document(path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    raise ValueError("Unsupported file format")

def arabic_split_text(text, max_chars=500):
    """Split text into sentence-aligned chunks of at most ``max_chars`` characters.

    Sentences are packed greedily: consecutive sentences are joined with a
    single space for as long as the joined chunk stays within ``max_chars``.
    A single sentence longer than ``max_chars`` is kept whole as its own chunk.

    Args:
        text: The text to split.
        max_chars: Upper bound on the length of a multi-sentence chunk.

    Returns:
        A list of non-empty, stripped chunk strings (empty list for blank text).
    """
    import re  # local import so the function does not depend on file-level imports

    try:
        from nltk.tokenize import sent_tokenize

        sentences = sent_tokenize(text, language='arabic')
    except (ImportError, LookupError):
        # NLTK raises LookupError when no Punkt model for the requested
        # language is installed (the stock 'punkt' data does not appear to
        # ship an Arabic one). Fall back to splitting after Latin/Arabic
        # sentence terminators and on line breaks.
        sentences = re.split(r'(?<=[.!?؟])\s+|\n+', text)

    chunks = []
    current = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if not current:
            # Start a new chunk; never emit an empty chunk, even when the
            # very first sentence is already longer than max_chars.
            current = sentence
        elif len(current) + 1 + len(sentence) <= max_chars:
            # +1 accounts for the joining space.
            current += " " + sentence
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks

def train_from_texts(texts):
    """Rebuild the vector index and the retrieval QA chain from ``texts``.

    Every text is split with ``arabic_split_text``; the resulting chunks are
    embedded and stored in a FAISS vector store, and a ``RetrievalQA`` chain
    retrieving the 10 most similar chunks is created on top of it. The
    module-level ``vector_store`` and ``qa_chain`` are replaced only after
    both objects were built successfully.

    Args:
        texts: Iterable of document texts (one string per uploaded book).

    Raises:
        ValueError: If the texts yield no non-empty chunk to index.
    """
    global vector_store, qa_chain

    # Drop empty chunks so they are never embedded.
    all_chunks = [
        chunk
        for text in texts
        for chunk in arabic_split_text(text)
        if chunk
    ]
    if not all_chunks:
        raise ValueError("No text available to build the index")

    # from_texts embeds the chunks, fills the FAISS index and creates the
    # docstore mapping in one step; the bare FAISS(...) constructor has no
    # `documents` argument and would leave the index empty.
    store = FAISS.from_texts(all_chunks, embeddings)
    retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    # Publish both together so a failure above leaves the previous state intact.
    vector_store = store
    qa_chain = chain

def upload_book(file, progress=gr.Progress()):
    """Gradio callback: extract an uploaded book and re-index all books.

    The uploaded file is read directly from the path Gradio provides, so the
    original extension is preserved for format detection and no extra
    temporary copy is left behind.

    Args:
        file: The value delivered by the ``gr.File`` component: either a
            filesystem path (``str`` / ``os.PathLike``) or a file-like object
            exposing the path as ``.name``.
        progress: Gradio progress tracker, called with a fraction and an
            Arabic description at each stage.

    Returns:
        The Arabic status message shown once indexing has finished.

    Raises:
        ValueError: If no file was supplied, its path cannot be determined,
            or its format is unsupported.
    """
    if file is None:
        raise ValueError("No file uploaded")

    # NOTE(review): depending on the Gradio version/config the component hands
    # over a path string or a temp-file wrapper - handle both; confirm against
    # the installed Gradio release.
    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
    else:
        source_path = getattr(file, "name", None)
    if not source_path:
        raise ValueError("Unsupported file format")

    progress(0.2, desc="تحميل الملف...")
    extracted_text = extract_text_from_file(source_path)
    progress(0.5, desc="معالجة النص...")

    # Index first, record afterwards: a failed run must not leave the new text
    # behind in uploaded_texts.
    train_from_texts(uploaded_texts + [extracted_text])
    uploaded_texts.append(extracted_text)
    progress(1.0, desc="اكتمل التدريب!")
    return "النظام جاهز للإجابة على أسئلتك"

def answer_question(user_question):
    """Gradio callback: answer ``user_question`` from the indexed books.

    Returns an Arabic notice asking the user to upload a book when no QA
    chain has been built yet; otherwise wraps the question in the Arabic
    prompt and returns whatever the chain produces for it.
    """
    if qa_chain is not None:
        return qa_chain.run(format_arabic_prompt(user_question))
    return "الرجاء رفع كتاب أولاً."

# Build the two-tab Gradio interface; components appear in declaration order.
with gr.Blocks() as demo:
    # Tab 1 ("Upload books"): a file picker limited to .pdf/.docx/.doc and a
    # status box that shows the string returned by upload_book.
    with gr.Tab("تحميل الكتب"):
        upload_button = gr.File(label="ارفع كتابك (.pdf .docx .doc)", file_types=[".pdf", ".docx", ".doc"])
        upload_output = gr.Textbox(label="حالة النظام")
        # Run upload_book whenever a file is uploaded.
        upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)

    # Tab 2 ("Ask the book"): question box, answer box and a submit button
    # wired to answer_question.
    with gr.Tab("اسأل الكتاب"):
        question = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer = gr.Textbox(label="الإجابة")
        ask_button = gr.Button("إرسال السؤال")
        ask_button.click(answer_question, inputs=question, outputs=answer)

# Start the app at import time with Gradio's share option enabled.
# NOTE(review): this runs unconditionally (no __main__ guard) - confirm that is
# intended for the deployment target.
demo.launch(share=True)