File size: 3,542 Bytes
0ffbfee
a503e7e
0ffbfee
 
a503e7e
0ffbfee
 
 
a62dca0
0ffbfee
 
 
3b6dd97
0ffbfee
 
 
 
a62dca0
0ffbfee
a62dca0
 
0ffbfee
 
 
 
a62dca0
0ffbfee
 
a62dca0
0ffbfee
 
 
 
 
 
 
 
 
 
a62dca0
0ffbfee
 
 
a62dca0
0ffbfee
 
 
a62dca0
0ffbfee
 
 
 
a62dca0
0ffbfee
 
 
 
 
 
 
 
 
a62dca0
0ffbfee
 
 
 
 
a62dca0
0ffbfee
 
 
 
 
a62dca0
 
0ffbfee
 
 
 
a62dca0
 
0ffbfee
 
 
 
 
 
a503e7e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# app.py
import os
import shutil
import chromadb
import gradio as gr
from ctransformers import AutoModelForCausalLM
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers

# 1. Set up the LLM locally (no API) via the LangChain CTransformers wrapper.
# FIX: the original built a raw ctransformers AutoModelForCausalLM, but
# RetrievalQA.from_chain_type(llm=...) below expects a LangChain LLM object.
# The CTransformers wrapper (imported above from langchain.llms) loads the
# same GGUF model and satisfies that interface.
llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    config={"max_new_tokens": 512, "temperature": 0.7},  # generation limits
)

# 2. Set up the on-disk storage folder for the Chroma vector index.
CHROMA_DIR = "chroma_store"
# Start every launch from a clean index so stale chunks from a previous
# run cannot leak into answers.
if os.path.exists(CHROMA_DIR):
    shutil.rmtree(CHROMA_DIR)

# 3. File loading and splitting.
# Map of lowercase file extension -> LangChain document-loader class.
SUPPORTED_TYPES = {"pdf": PyPDFLoader, "docx": Docx2txtLoader, "txt": TextLoader}

def load_documents(file_paths):
    """Load each supported file into LangChain documents.

    Files whose extension is not in SUPPORTED_TYPES are skipped silently.
    Returns a flat list of documents from all loadable files.
    """
    collected = []
    for file_path in file_paths:
        extension = file_path.rsplit(".", 1)[-1].lower()
        loader_cls = SUPPORTED_TYPES.get(extension)
        if loader_cls is None:
            continue  # unsupported extension — ignore
        collected.extend(loader_cls(file_path).load())
    return collected

# 4. Text splitting and embedding setup.
# 500-character chunks with 100-character overlap so context is preserved
# across chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# Multilingual sentence-transformer model (suits the Arabic-facing UI).
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

def create_vectorstore(docs):
    """Split *docs* into chunks and persist them as a Chroma vector store."""
    chunks = text_splitter.split_documents(docs)
    store = Chroma.from_documents(chunks, embedding, persist_directory=CHROMA_DIR)
    return store

# 5. Gradio interface.
# Module-level state shared by the upload and answer callbacks below.
uploaded_files = []  # paths of every file uploaded so far
db = None            # Chroma vector store (built on first upload)
qa_chain = None      # RetrievalQA chain (None until files are uploaded)

def upload_files(files):
    """Ingest uploaded files and (re)build the retrieval QA chain.

    FIX: the original extended ``uploaded_files`` with every batch but built
    the vector store from only the *latest* batch, silently dropping earlier
    uploads from the knowledge base; it also rebuilt into a non-empty persist
    directory, duplicating chunks. Now the store is wiped and rebuilt from
    the full accumulated file list on each upload.

    :param files: Gradio file objects exposing a ``.name`` temp-file path.
    :return: Arabic status message shown in the UI.
    """
    global uploaded_files, db, qa_chain
    new_paths = [f.name for f in files]
    uploaded_files.extend(new_paths)
    # Rebuild from ALL files uploaded so far, not just this batch.
    docs = load_documents(uploaded_files)
    # Wipe the persisted index first so re-ingested documents are not duplicated.
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    db = create_vectorstore(docs)
    retriever = db.as_retriever(search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return "تم تحميل الملفات وبناء قاعدة المعرفة بنجاح ✅"

def answer_question_arabic(question):
    """Answer *question* from the knowledge base.

    Returns an Arabic prompt asking the user to upload files first when no
    QA chain has been built yet.
    """
    if qa_chain is None:
        return "من فضلك قم أولاً بتحميل ملفاتك وبناء قاعدة المعرفة."
    return qa_chain.run(question)

# Build the Gradio UI (right-to-left layout for the Arabic interface).
with gr.Blocks(theme=gr.themes.Soft(), rtl=True, title="Smart PDF Assistant") as demo:
    # Header (Arabic): "Smart Arabic document assistant — attach your files
    # (PDF, DOCX, TXT) then ask any question."
    gr.Markdown("""
    # 🤖 مساعد الوثائق الذكي باللغة العربية
    أرفق ملفاتك (PDF، DOCX، TXT) ثم اسأل أي سؤال.
    """)

    with gr.Row():
        # File picker restricted to the supported extensions; multiple uploads allowed.
        file_input = gr.File(file_types=[".pdf", ".docx", ".txt"], file_count="multiple", label="📁 ارفع ملفاتك")
        upload_button = gr.Button("تحميل الملفات")

    # Status message displayed after ingestion completes.
    status_output = gr.Textbox(label="الحالة")

    with gr.Row():
        # Free-text question input and submit button.
        question_input = gr.Textbox(lines=2, placeholder="✍️ اكتب سؤالك هنا", label="السؤال")
        answer_button = gr.Button("أرسل")
    answer_output = gr.Textbox(label="الإجابة", lines=5)

    # Wire callbacks: upload builds the knowledge base; answer queries it.
    upload_button.click(upload_files, inputs=[file_input], outputs=[status_output])
    answer_button.click(answer_question_arabic, inputs=[question_input], outputs=[answer_output])

# Launch the web app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()