# Source: Hugging Face Space "ramysaidagieb/Answer1" (status: Sleeping), file size 4,256 bytes, revision d28c712.

import gradio as gr
import os
import tempfile
import faiss
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from pdfminer.high_level import extract_text as extract_pdf_text
import docx
import nltk

# Download the NLTK "punkt" resource at import time (presumably the data that
# sent_tokenize relies on), then import the sentence tokenizer.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Module-level state shared by the handlers below:
#   uploaded_texts - raw text of every book uploaded so far
#   vector_store   - assigned by train_from_texts()
#   qa_chain       - assigned by train_from_texts(); None until then
uploaded_texts = []
vector_store = None
qa_chain = None

# Embedding model used to vectorise text chunks.
embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Seq2seq model and tokenizer, loaded once at import time.
model_name = "csebuetnlp/mT5_small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Wrap the model in a text2text-generation pipeline (max_length=512) and expose
# it to LangChain as an LLM.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)

# Arabic instruction wrapper; "{question}" is the only placeholder.
ARABIC_PROMPT_TEMPLATE = """
أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
لا تستخدم أي معلومات خارجية.
السؤال: {question}
الإجابة:
"""


def format_arabic_prompt(question):
    """Return ``ARABIC_PROMPT_TEMPLATE`` with *question* substituted in."""
    substitutions = {"question": question}
    filled_prompt = ARABIC_PROMPT_TEMPLATE.format_map(substitutions)
    return filled_prompt

def extract_text_from_file(file_path):
    """Return the plain text of a PDF or Word document.

    The document type is chosen from the file extension, compared
    case-insensitively so that names such as ``BOOK.PDF`` are accepted.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text. For Word documents the paragraph texts are
        joined with newlines.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
    """
    lowered_name = str(file_path).lower()

    if lowered_name.endswith(".pdf"):
        return extract_pdf_text(file_path)

    if lowered_name.endswith((".docx", ".doc")):
        # NOTE(review): python-docx targets the .docx format; a legacy binary
        # .doc file will probably fail to open here - confirm and convert
        # upstream if such files must be supported.
        doc = docx.Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    raise ValueError("Unsupported file format")

def arabic_split_text(text):
    """Split *text* into sentence-aligned chunks of at most ~500 characters.

    Sentences are detected with a regular expression: a sentence ends at
    ``.``, ``!``, ``?`` or the Arabic question mark ``؟`` followed by
    whitespace, or at a line break. NLTK's punkt data does not ship an Arabic
    model, so ``sent_tokenize(..., language='arabic')`` cannot be used here.

    Consecutive sentences are packed greedily into a chunk while the chunk,
    including the joining spaces, stays within the limit. A single sentence
    longer than the limit becomes a chunk of its own.

    Args:
        text: Raw document text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of non-empty, stripped chunk strings in document order.
    """
    import re  # local import keeps the block self-contained

    max_chars = 500
    sentence_boundary = r"(?<=[.!?؟])\s+|\n+"

    chunks = []
    current = ""
    for raw_sentence in re.split(sentence_boundary, text or ""):
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_chars:
            current = candidate
            continue

        # The sentence does not fit: close the current chunk (if any) and
        # start a new one, never emitting an empty chunk.
        if current:
            chunks.append(current)
        current = sentence

    if current:
        chunks.append(current)
    return chunks

def train_from_texts(texts):
    """Rebuild the vector store and the QA chain from raw document texts.

    Every text is split with ``arabic_split_text``; the non-empty chunks are
    embedded and indexed with ``FAISS.from_texts``, and a ``RetrievalQA``
    chain is created on a similarity retriever (k=10) over that store. On
    success the module-level ``vector_store`` and ``qa_chain`` are replaced;
    on failure they are left untouched.

    Args:
        texts: Iterable of raw document strings.

    Raises:
        ValueError: If the texts produce no non-empty chunk to index.
    """
    global vector_store, qa_chain

    all_chunks = [
        chunk
        for text in texts
        for chunk in arabic_split_text(text)
        if chunk  # empty chunks carry no information and only pollute the index
    ]
    if not all_chunks:
        raise ValueError("No text chunks available to build the index")

    # from_texts embeds the chunks, creates the index, adds the vectors and
    # fills the docstore in one step. Constructing FAISS(...) directly needs a
    # docstore and an id mapping, and has no ``documents`` argument.
    new_store = FAISS.from_texts(all_chunks, embeddings)
    retriever = new_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    new_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    # Publish both objects only after everything above succeeded.
    vector_store = new_store
    qa_chain = new_chain

def upload_book(file, progress=gr.Progress()):
    """Gradio handler: ingest an uploaded book and retrain the QA system.

    The text is extracted straight from the uploaded file's own path, so the
    original extension is available to ``extract_text_from_file``. No second,
    suffix-less temporary copy is created, and therefore nothing is left
    behind on disk.

    Args:
        file: The value delivered by the ``gr.File`` component. Depending on
            the Gradio version/configuration this is a path string or a
            temp-file wrapper exposing ``.name``; both are accepted.
        progress: Gradio progress tracker used to report the three stages.

    Returns:
        An Arabic status message for the status textbox.
    """
    if file is None:
        # The upload was cleared or nothing was selected.
        return "الرجاء اختيار ملف أولاً."

    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
    else:
        source_path = file.name

    progress(0.2, desc="تحميل الملف...")
    extracted_text = extract_text_from_file(source_path)
    progress(0.5, desc="معالجة النص...")

    # Train on the candidate list first and record the new text only after
    # training succeeds, so a failed upload does not affect later ones.
    train_from_texts(uploaded_texts + [extracted_text])
    uploaded_texts.append(extracted_text)

    progress(1.0, desc="اكتمل التدريب!")
    return "النظام جاهز للإجابة على أسئلتك"

def answer_question(user_question):
    """Answer *user_question* with the trained QA chain.

    Returns a reminder to upload a book while no chain has been built yet;
    otherwise wraps the question with ``format_arabic_prompt`` and returns the
    chain's output.
    """
    if qa_chain is not None:
        return qa_chain.run(format_arabic_prompt(user_question))
    return "الرجاء رفع كتاب أولاً."

# Two-tab Gradio interface: one tab to upload books, one to ask questions.
with gr.Blocks() as demo:
    # Upload tab: each uploaded file is passed to upload_book and the returned
    # status message is shown in the textbox.
    with gr.Tab("تحميل الكتب"):
        upload_button = gr.File(label="ارفع كتابك (.pdf .docx .doc)", file_types=[".pdf", ".docx", ".doc"])
        upload_output = gr.Textbox(label="حالة النظام")
        upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)

    # Question tab: clicking the button sends the question text to
    # answer_question and displays its return value.
    with gr.Tab("اسأل الكتاب"):
        question = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer = gr.Textbox(label="الإجابة")
        ask_button = gr.Button("إرسال السؤال")
        ask_button.click(answer_question, inputs=question, outputs=answer)

# Start the app at import time, requesting a public share link.
demo.launch(share=True)