Answer1 / app.py
ramysaidagieb's picture
Upload 5 files
d28c712 verified
raw
history blame
4.26 kB
import gradio as gr
import os
import tempfile
import faiss
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from pdfminer.high_level import extract_text as extract_pdf_text
import docx
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
# Raw text of every book uploaded so far; upload_book appends to this list.
uploaded_texts = []
# Both are rebound by train_from_texts once text has been indexed;
# answer_question treats `qa_chain is None` as "no book uploaded yet".
vector_store = None
qa_chain = None
# Embedding model used to vectorise text chunks (loaded at import time).
embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
# Seq2seq model wrapped in a text2text-generation pipeline and exposed to
# LangChain as `llm` (also loaded at import time).
model_name = "csebuetnlp/mT5_small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)
# Arabic prompt wrapper with a single `{question}` placeholder.
# English gloss: "You are an intelligent system that answers based only on
# information extracted from the books. Do not use any external information.
# Question: {question} / Answer:"
ARABIC_PROMPT_TEMPLATE = """
أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
لا تستخدم أي معلومات خارجية.
السؤال: {question}
الإجابة:
"""
def format_arabic_prompt(question):
    """Return ARABIC_PROMPT_TEMPLATE with its ``{question}`` slot filled in.

    Args:
        question: The user's question; substituted via ``str.format``.

    Returns:
        The complete prompt string.
    """
    filled_prompt = ARABIC_PROMPT_TEMPLATE.format(question=question)
    return filled_prompt
def extract_text_from_file(file_path):
    """Extract plain text from a PDF or Word document.

    The format is chosen from the file extension, compared
    case-insensitively so that names such as ``BOOK.PDF`` are accepted.

    Args:
        file_path: Path to the document (``str`` or ``os.PathLike``).

    Returns:
        The document text. For Word files the paragraphs are joined with
        newline characters.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
    """
    lowered_path = os.fspath(file_path).lower()

    if lowered_path.endswith(".pdf"):
        return extract_pdf_text(file_path)

    # NOTE(review): python-docx is documented for the .docx (Open XML) format;
    # a legacy binary .doc file will probably fail inside docx.Document.
    # The branch is kept so .docx files saved with a .doc name still work.
    if lowered_path.endswith((".docx", ".doc")):
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    raise ValueError("Unsupported file format")
def arabic_split_text(text, max_chars=500):
    """Split text into sentence-aligned chunks of roughly ``max_chars``.

    Sentences are detected with a regular expression rather than NLTK's
    Punkt tokenizer, because Punkt does not ship an Arabic model and
    ``sent_tokenize(..., language='arabic')`` fails with ``LookupError``.
    A sentence ends at ``.``, ``!``, ``?`` or the Arabic question mark
    when followed by whitespace, or at a line break.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_chars: Soft upper bound on chunk length, counting the single
            space used to join sentences. A single sentence longer than
            this is kept whole as its own chunk.

    Returns:
        A list of non-empty, stripped chunk strings in document order.
    """
    import re  # local import: `re` is not part of the module's import header

    if not text:
        return []

    pieces = re.split(r"(?<=[.!?؟])\s+|\n+", text)
    sentences = [piece.strip() for piece in pieces if piece.strip()]

    chunks = []
    current = ""
    for sentence in sentences:
        if not current:
            # Starting a new chunk; never emit an empty one, even when the
            # sentence alone exceeds max_chars.
            current = sentence
        elif len(current) + 1 + len(sentence) <= max_chars:
            current = current + " " + sentence
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
def train_from_texts(texts):
    """Build the FAISS vector store and the retrieval QA chain from texts.

    Each text is chunked with ``arabic_split_text``; the chunks are embedded
    and indexed through ``FAISS.from_texts``, which creates the index, the
    docstore and the id mapping together. On success the module-level
    ``vector_store`` and ``qa_chain`` are rebound; on failure they are left
    unchanged.

    Args:
        texts: Iterable of document strings.

    Raises:
        ValueError: If no non-empty chunk could be produced from ``texts``.
    """
    global vector_store, qa_chain

    all_chunks = []
    for text in texts:
        all_chunks.extend(
            chunk for chunk in arabic_split_text(text) if chunk and chunk.strip()
        )

    if not all_chunks:
        # Without this guard the failure surfaces later as an opaque error
        # from the embedding / indexing step.
        raise ValueError("No text could be extracted for indexing")

    vector_store = FAISS.from_texts(all_chunks, embeddings)
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 10},
    )
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
def upload_book(file, progress=gr.Progress()):
    """Gradio upload handler: extract a book's text and rebuild the QA index.

    The uploaded file is read directly from the path Gradio provides. The
    previous approach copied it into an extension-less temporary file, which
    made extension-based format detection fail and left the copy on disk.

    Args:
        file: The value delivered by the ``gr.File`` component: either a
            path string or an object exposing the path as ``.name``.
        progress: Gradio progress tracker used to report the three stages.

    Returns:
        A status message (Arabic) for the status textbox.

    Raises:
        ValueError: Propagated from ``extract_text_from_file`` for
            unsupported formats.
    """
    if file is None:
        return "لم يتم رفع أي ملف."

    source_path = file if isinstance(file, str) else file.name

    progress(0.2, desc="تحميل الملف...")
    extracted_text = extract_text_from_file(source_path)

    progress(0.5, desc="معالجة النص...")
    # Train on a candidate list first so that a failed run does not leave the
    # new text behind in the shared `uploaded_texts` state.
    train_from_texts(uploaded_texts + [extracted_text])
    uploaded_texts.append(extracted_text)

    progress(1.0, desc="اكتمل التدريب!")
    return "النظام جاهز للإجابة على أسئلتك"
def answer_question(user_question):
    """Answer a question using the retrieval QA chain.

    Args:
        user_question: The question text entered by the user.

    Returns:
        The chain's answer, or an Arabic notice asking the user to upload a
        book first when no chain has been built yet.
    """
    if qa_chain is not None:
        # The question is wrapped in the Arabic prompt before being run.
        return qa_chain.run(format_arabic_prompt(user_question))
    return "الرجاء رفع كتاب أولاً."
# Two-tab UI: the first tab uploads a book and shows the system status,
# the second tab takes a question and shows the answer.
with gr.Blocks() as demo:
    with gr.Tab("تحميل الكتب"):
        upload_button = gr.File(
            label="ارفع كتابك (.pdf .docx .doc)",
            file_types=[".pdf", ".docx", ".doc"],
        )
        upload_output = gr.Textbox(label="حالة النظام")
        # Run upload_book as soon as a file has been uploaded.
        upload_button.upload(
            fn=upload_book,
            inputs=upload_button,
            outputs=upload_output,
        )

    with gr.Tab("اسأل الكتاب"):
        question = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer = gr.Textbox(label="الإجابة")
        ask_button = gr.Button(value="إرسال السؤال")
        ask_button.click(
            fn=answer_question,
            inputs=question,
            outputs=answer,
        )

demo.launch(share=True)