# app.py
import os
import shutil
import gradio as gr
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import CTransformers
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
# 1. Set up a local LLM (no API) with ctransformers, through LangChain's
#    CTransformers wrapper so it can be passed straight into RetrievalQA below
llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    config={"max_new_tokens": 512, "temperature": 0.7},
)
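# Note: on the first run ctransformers downloads the GGUF weights from the
# Hugging Face Hub and caches them locally; later runs load from the cache.
# The `config` keys here are ctransformers generation settings, not model config.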
# 2. Set up the storage directory (wipe any previous index so each launch starts clean)
CHROMA_DIR = "chroma_store"
if os.path.exists(CHROMA_DIR):
    shutil.rmtree(CHROMA_DIR)
# 3. Load the uploaded files
SUPPORTED_TYPES = {"pdf": PyPDFLoader, "docx": Docx2txtLoader, "txt": TextLoader}
def load_documents(file_paths):
    """Load each supported file with the matching LangChain document loader."""
    documents = []
    for path in file_paths:
        ext = path.split(".")[-1].lower()
        loader_class = SUPPORTED_TYPES.get(ext)
        if loader_class:  # silently skip unsupported extensions
            loader = loader_class(path)
            docs = loader.load()
            documents.extend(docs)
    return documents
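# Illustrative usage (hypothetical file names, assuming they exist on disk):
#   docs = load_documents(["report.pdf", "notes.docx", "readme.txt"])
# PyPDFLoader yields one Document per page; the DOCX and TXT loaders yield one per file.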
# 4. Split the texts and create the vector embeddings
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
def create_vectorstore(docs):
    texts = text_splitter.split_documents(docs)
    return Chroma.from_documents(texts, embedding, persist_directory=CHROMA_DIR)
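# Chroma.from_documents embeds every chunk with the multilingual MiniLM model
# above and writes the index under CHROMA_DIR. A persisted index could later be
# reopened without re-embedding, e.g. (sketch):
#   db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embedding)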
# 5. Gradio interface
uploaded_files = []
db = None
qa_chain = None
def upload_files(files):
    global uploaded_files, db, qa_chain
    uploaded_paths = [f.name for f in files]
    uploaded_files.extend(uploaded_paths)
    docs = load_documents(uploaded_paths)
    db = create_vectorstore(docs)
    retriever = db.as_retriever(search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    # "Files loaded and knowledge base built successfully ✅"
    return "تم تحميل الملفات وبناء قاعدة المعرفة بنجاح ✅"
def answer_question_arabic(question):
    if not qa_chain:
        # "Please upload your files and build the knowledge base first."
        return "من فضلك قم أولاً بتحميل ملفاتك وبناء قاعدة المعرفة."
    return qa_chain.run(question)
with gr.Blocks(theme=gr.themes.Soft(), title="Smart PDF Assistant") as demo:
    # Heading: "Smart document assistant in Arabic —
    # attach your files (PDF, DOCX, TXT), then ask any question."
    gr.Markdown("""
    # 🤖 مساعد الوثائق الذكي باللغة العربية
    أرفق ملفاتك (PDF، DOCX، TXT) ثم اسأل أي سؤال.
    """)
    with gr.Row():
        # "Upload your files" / "Load the files"
        file_input = gr.File(file_types=[".pdf", ".docx", ".txt"], file_count="multiple", label="📁 ارفع ملفاتك")
        upload_button = gr.Button("تحميل الملفات")
    status_output = gr.Textbox(label="الحالة")  # "Status"
    with gr.Row():
        # "Write your question here" / "Question"
        question_input = gr.Textbox(lines=2, placeholder="✍️ اكتب سؤالك هنا", label="السؤال", rtl=True)
        answer_button = gr.Button("أرسل")  # "Send"
    answer_output = gr.Textbox(label="الإجابة", lines=5, rtl=True)  # "Answer"
    upload_button.click(upload_files, inputs=[file_input], outputs=[status_output])
    answer_button.click(answer_question_arabic, inputs=[question_input], outputs=[answer_output])
if __name__ == "__main__":
    demo.launch()
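# For container or Spaces-style deployments, launch() can also take explicit
# host/port arguments (an option, not required here), e.g.:
#   demo.launch(server_name="0.0.0.0", server_port=7860)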