Spaces:

ramysaidagieb
/

brain247v1

Sleeping

File size: 3,051 Bytes

d9c732d
 
 
2d232ac
d9c732d
2d232ac
d9c732d
2d232ac
d9c732d
2d232ac
d9c732d
 
2d232ac
d9c732d
 
 
2d232ac
d9c732d
 
 
 
 
 
 
 
2d232ac
d9c732d
 
 
2d232ac
d9c732d
 
2d232ac
d9c732d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d232ac
 
 
d9c732d
 
 
 
 
2d232ac
d9c732d
 
 
 
 
 
2d232ac
d9c732d
 
 
2d232ac
d9c732d
 
2d232ac

import os
import shutil
import tempfile
import gradio as gr

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import LiteLLM

DB_DIR = "chroma_db"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

def load_documents(file_path):
    """Load a PDF or Word file using the loader matching its extension.

    The extension is compared case-insensitively, so names such as
    ``REPORT.PDF`` or ``Notes.Docx`` are accepted as well.

    Args:
        file_path: Path of the file to load.

    Returns:
        The result of the selected loader's ``load()`` call.

    Raises:
        ValueError: If the path does not end in ``.pdf``, ``.docx`` or
            ``.doc``.
    """
    # Uploaded files frequently carry upper- or mixed-case extensions;
    # normalise only for the comparison and hand the loader the real path.
    normalized = file_path.lower()
    if normalized.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif normalized.endswith((".docx", ".doc")):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError("Unsupported file type. Only PDF and DOCX are supported.")
    return loader.load()

def create_vector_store(documents):
    """Chunk *documents*, embed the chunks and persist them in Chroma.

    Chunking uses the module-level ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``;
    the store is created with ``DB_DIR`` as its persist directory.

    Args:
        documents: Documents to split and index.

    Returns:
        The Chroma vector store built from the chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    chunks = splitter.split_documents(documents)

    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    store = Chroma.from_documents(
        chunks,
        embedding=embedder,
        persist_directory=DB_DIR,
    )
    store.persist()
    return store

def process_file(file):
    """Index an uploaded file so that questions can be asked about it.

    The upload is copied into the system temp directory under its own base
    name, loaded with ``load_documents``, and a fresh vector store is built
    from it, replacing any existing ``DB_DIR``.

    Args:
        file: The value passed by the upload component: ``None`` when the
            upload was cleared, a path string, or an object exposing the
            path as ``.name``.

    Returns:
        A status message for the UI; an empty string when there is no file.

    Raises:
        ValueError: Propagated from ``load_documents`` for unsupported
            file types.
    """
    # The change event also fires when the user clears the upload; there is
    # nothing to index then, so just blank the status box.
    if file is None:
        return ""

    # Accept both a plain path string and a tempfile-like wrapper.
    temp_path = file if isinstance(file, str) else file.name
    target_path = os.path.join(tempfile.gettempdir(), os.path.basename(temp_path))

    # The upload may already live in the temp directory; copying a file
    # onto itself would raise shutil.SameFileError.
    if os.path.abspath(temp_path) != os.path.abspath(target_path):
        shutil.copy(temp_path, target_path)

    # Load first so a bad file does not wipe the previous index.
    documents = load_documents(target_path)

    if os.path.exists(DB_DIR):
        shutil.rmtree(DB_DIR)

    create_vector_store(documents)
    return "✅ تم معالجة الملف بنجاح. يمكنك الآن كتابة سؤالك."

def ask_question(question):
    """Answer *question* using the documents indexed under ``DB_DIR``.

    Args:
        question: The user's question text.

    Returns:
        The answer produced by the retrieval QA chain, or a short
        user-facing notice when the question is blank or no document has
        been indexed yet.
    """
    # Skip the expensive model/LLM setup when there is nothing to ask.
    if not question or not question.strip():
        return "⚠️ من فضلك اكتب سؤالاً أولاً."

    # Without an index directory, Chroma would be opened on an empty store
    # and the LLM would answer with no supporting context.
    if not os.path.isdir(DB_DIR):
        return "⚠️ من فضلك ارفع ملف PDF أو DOCX أولاً."

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)

    retriever = vectordb.as_retriever()

    # Original author's note: no API key needed.
    # NOTE(review): confirm this holds for the configured LiteLLM provider.
    llm = LiteLLM(model="mistralai/Mistral-7B-Instruct-v0.2")
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    return qa_chain.run(question)

# Gradio UI: one row for uploading a document and showing its processing
# status, one row for asking a question and displaying the answer.
with gr.Blocks(title="Smart PDF Assistant") as demo:
    gr.Markdown("### 🤖 مساعد الكتب الذكي - اسأل أي سؤال بناءً على ملف PDF أو DOCX")
    
    # Upload row: file picker limited to PDF/Word plus a read-only status box.
    with gr.Row():
        file_input = gr.File(label="📄 ارفع ملف PDF أو DOCX", file_types=[".pdf", ".docx", ".doc"])
        file_status = gr.Textbox(label="حالة الملف", interactive=False)

    # Q&A row: question entry and an 8-line answer box.
    with gr.Row():
        question_input = gr.Textbox(label="❓ اكتب سؤالك هنا", placeholder="ما هو إيمان الكنيسة؟")
        answer_output = gr.Textbox(label="📘 الإجابة", lines=8)

    # Index the document whenever the upload changes; the returned message
    # is shown in the status box.
    file_input.change(process_file, inputs=file_input, outputs=file_status)
    # Run the question through ask_question when the textbox is submitted.
    question_input.submit(ask_question, inputs=question_input, outputs=answer_output)

# Start the app at module level (runs on import as well as on direct execution).
demo.launch()