# Smart PDF/DOCX question-answering assistant — Gradio app (Hugging Face Space).
# Upload a PDF/DOCX, it is indexed into a Chroma vector store, then ask questions.
import os
import shutil
import tempfile

import gradio as gr
from langchain.chains import RetrievalQA
from langchain.llms import LiteLLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# Directory where the Chroma vector store is persisted between callbacks.
DB_DIR = "chroma_db"
# Character-based chunking parameters for the text splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
def load_documents(file_path):
    """Load a PDF or Word document and return its langchain Documents.

    Args:
        file_path: Path to a ``.pdf``, ``.docx`` or ``.doc`` file.

    Returns:
        The list of Document objects produced by the matching loader.

    Raises:
        ValueError: For any other file extension.
    """
    # Compare the extension case-insensitively so "FILE.PDF" is accepted too.
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext in (".docx", ".doc"):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("Unsupported file type. Only PDF and DOCX are supported.")
    return loader.load()
def create_vector_store(documents):
    """Split *documents* into chunks, embed them and persist a Chroma store.

    Args:
        documents: langchain Documents as returned by ``load_documents``.

    Returns:
        The Chroma vector store persisted under ``DB_DIR``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    texts = text_splitter.split_documents(documents)
    # Multilingual embedding model so Arabic questions can match Arabic passages.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory=DB_DIR)
    # Write the index to disk so ask_question() can reopen it in a later call.
    vectordb.persist()
    return vectordb
def process_file(file):
    """Gradio callback: rebuild the vector store from an uploaded file.

    Copies the upload into the system temp dir, wipes any previous index so
    answers come only from the new file, and returns a status string for the UI.

    Args:
        file: The Gradio File payload (has a ``.name`` path), or ``None``.

    Returns:
        A human-readable status message for the status textbox.
    """
    # The change event also fires when the file selection is cleared.
    if file is None:
        return "⚠️ لم يتم رفع أي ملف."
    temp_path = file.name
    target_path = os.path.join(tempfile.gettempdir(), os.path.basename(temp_path))
    # Gradio may already stage uploads in the temp dir; skip a same-file copy.
    if os.path.abspath(temp_path) != os.path.abspath(target_path):
        shutil.copy(temp_path, target_path)
    try:
        documents = load_documents(target_path)
    except ValueError as err:
        # Report unsupported extensions in the status box instead of crashing the UI.
        return f"❌ {err}"
    # Drop the previous index so stale chunks cannot leak into new answers.
    if os.path.exists(DB_DIR):
        shutil.rmtree(DB_DIR)
    create_vector_store(documents)
    return "✅ تم معالجة الملف بنجاح. يمكنك الآن كتابة سؤالك."
def ask_question(question):
    """Gradio callback: answer *question* from the persisted vector store.

    Args:
        question: The user's question text.

    Returns:
        The LLM answer, or a warning string when there is nothing to answer
        with (empty question, or no file processed yet).
    """
    # Guard against empty submissions before doing any heavy model loading.
    if not question or not question.strip():
        return "⚠️ الرجاء كتابة سؤال أولاً."
    # No index on disk means no file has been processed yet.
    if not os.path.exists(DB_DIR):
        return "⚠️ الرجاء رفع ملف ومعالجته أولاً."
    # Must be the same embedding model used at index time for meaningful search.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
    retriever = vectordb.as_retriever()
    llm = LiteLLM(model="mistralai/Mistral-7B-Instruct-v0.2")  # no API key needed
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return qa_chain.run(question)
# --- Gradio UI --------------------------------------------------------------
with gr.Blocks(title="Smart PDF Assistant") as demo:
    gr.Markdown("### 🤖 مساعد الكتب الذكي - اسأل أي سؤال بناءً على ملف PDF أو DOCX")
    with gr.Row():
        file_input = gr.File(label="📄 ارفع ملف PDF أو DOCX", file_types=[".pdf", ".docx", ".doc"])
        file_status = gr.Textbox(label="حالة الملف", interactive=False)
    with gr.Row():
        question_input = gr.Textbox(label="❓ اكتب سؤالك هنا", placeholder="ما هو إيمان الكنيسة؟")
        answer_output = gr.Textbox(label="📘 الإجابة", lines=8)
    # Wire the callbacks: re-index on upload, answer on Enter in the question box.
    file_input.change(process_file, inputs=file_input, outputs=file_status)
    question_input.submit(ask_question, inputs=question_input, outputs=answer_output)

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    demo.launch()