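"""Arabic book question-answering demo (Hugging Face Space).

Pipeline: upload a .pdf/.docx book -> extract its text -> pack sentences
into ~500-character chunks -> embed the chunks with CAMeLBERT into a FAISS
index -> answer questions through a LangChain RetrievalQA chain backed by
an mT5 text2text pipeline.
"""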
import re

import docx
import gradio as gr
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from pdfminer.high_level import extract_text as extract_pdf_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Global state, rebuilt whenever a new book is uploaded.
uploaded_texts = []
vector_store = None
qa_chain = None

# Arabic embeddings via CAMeLBERT (mean-pooled by HuggingFaceEmbeddings).
embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Generator LLM. "google/mt5-small" is the standard small mT5 checkpoint,
# assumed here; a model fine-tuned for Arabic QA would answer far better
# than the raw pretrained one.
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)

# Instruction prompt for the QA chain. Translation: "You are an intelligent
# system that answers based only on the information extracted from the books.
# Do not use any external information. Context / Question / Answer."
ARABIC_PROMPT_TEMPLATE = """
أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
لا تستخدم أي معلومات خارجية.
السياق: {context}
السؤال: {question}
الإجابة:
"""
# The {context} slot lets RetrievalQA inject the retrieved chunks.
ARABIC_PROMPT = PromptTemplate(template=ARABIC_PROMPT_TEMPLATE, input_variables=["context", "question"])

def extract_text_from_file(file_path):
    if file_path.endswith(".pdf"):
        return extract_pdf_text(file_path)
    elif file_path.endswith(".docx"):
        # python-docx reads .docx only; the legacy binary .doc format is not supported.
        doc = docx.Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)
    else:
        raise ValueError("Unsupported file format (expected .pdf or .docx)")

def arabic_split_text(text):
    # NLTK's punkt tokenizer ships no Arabic model, so sentences are split
    # on Arabic and Latin sentence-ending punctuation with a regex instead.
    sentences = re.split(r"(?<=[.!?؟؛])\s+", text)
    chunks = []
    chunk = ""
    for sentence in sentences:
        # Pack consecutive sentences into chunks of at most ~500 characters.
        if len(chunk) + len(sentence) <= 500:
            chunk += " " + sentence
        else:
            chunks.append(chunk.strip())
            chunk = sentence
    if chunk:
        chunks.append(chunk.strip())
    return chunks
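
# Example: arabic_split_text("جملة أولى. جملة ثانية؟ جملة ثالثة!") yields a
# single chunk holding all three sentences, since together they fall well
# under the 500-character budget.
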
def train_from_texts(texts):
    global vector_store, qa_chain
    all_chunks = []
    for text in texts:
        all_chunks.extend(arabic_split_text(text))
    # FAISS.from_texts embeds every chunk and builds the index in one step.
    vector_store = FAISS.from_texts(all_chunks, embeddings)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    # The "stuff" chain concatenates the retrieved chunks into the {context}
    # slot of the Arabic prompt.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": ARABIC_PROMPT},
    )

def upload_book(file, progress=gr.Progress()):
    # gr.File passes a filepath string in recent Gradio versions and a
    # tempfile wrapper (with a .name attribute) in older ones; either way
    # the temp path keeps the original extension, so it can be read directly.
    file_path = file if isinstance(file, str) else file.name
    progress(0.2, desc="تحميل الملف...")  # "Loading the file..."
    extracted_text = extract_text_from_file(file_path)
    uploaded_texts.append(extracted_text)
    progress(0.5, desc="معالجة النص...")  # "Processing the text..."
    train_from_texts(uploaded_texts)
    progress(1.0, desc="اكتمل التدريب!")  # "Training complete!"
    return "النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"

def answer_question(user_question):
    if qa_chain is None:
        return "الرجاء رفع كتاب أولاً."  # "Please upload a book first."
    # The raw question goes straight to the chain: the Arabic prompt is
    # applied inside RetrievalQA, and embedding the bare question (rather
    # than a full instruction prompt) keeps the similarity search clean.
    return qa_chain.run(user_question)

with gr.Blocks() as demo:
    with gr.Tab("تحميل الكتب"):  # "Upload books"
        upload_button = gr.File(label="ارفع كتابك (.pdf / .docx)", file_types=[".pdf", ".docx"])
        upload_output = gr.Textbox(label="حالة النظام")  # "System status"
        upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)
    with gr.Tab("اسأل الكتاب"):  # "Ask the book"
        question = gr.Textbox(label="اكتب سؤالك بالعربية")  # "Write your question in Arabic"
        answer = gr.Textbox(label="الإجابة")  # "The answer"
        ask_button = gr.Button("إرسال السؤال")  # "Send the question"
        ask_button.click(answer_question, inputs=question, outputs=answer)

demo.launch(share=True)  # share=True is ignored when the app runs on Spaces
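
# A plausible requirements.txt for this Space (exact versions unknown):
#   gradio
#   langchain
#   transformers
#   torch
#   sentencepiece
#   faiss-cpu
#   pdfminer.six
#   python-docx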