"""Gradio app that answers Arabic questions about uploaded books.

Uploaded PDF/DOCX files are converted to text, split into sentence-based
chunks, embedded into a FAISS vector store, and queried through a LangChain
``RetrievalQA`` chain backed by a local seq2seq model.
"""

import os
import re
import tempfile

import docx
import faiss
import gradio as gr
import nltk
import torch
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from nltk.tokenize import sent_tokenize
from pdfminer.high_level import extract_text as extract_pdf_text
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

nltk.download('punkt')

# Maximum number of characters per indexed chunk, and the overlap used when an
# oversized chunk has to be cut by the character splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# Fallback sentence boundary: whitespace after Latin/Arabic end punctuation,
# or any run of newlines.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?؟])\s+|\n+")

# Module-level state shared by the upload and question callbacks.
uploaded_texts = []
vector_store = None
qa_chain = None

embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

model_name = "csebuetnlp/mT5_small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)

# Whitespace is kept exactly as it appears in the original source.
ARABIC_PROMPT_TEMPLATE = (
    " أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب. لا تستخدم أي معلومات خارجية. \n"
    "السؤال: {question} الإجابة: "
)


def format_arabic_prompt(question):
    """Return the Arabic instruction prompt with *question* filled in."""
    return ARABIC_PROMPT_TEMPLATE.format(question=question)


def extract_text_from_file(file_path):
    """Extract plain text from a PDF or Word document.

    Args:
        file_path: Path to the file; the format is chosen from its extension
            (compared case-insensitively).

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        return extract_pdf_text(file_path)
    if lowered.endswith((".docx", ".doc")):
        # NOTE(review): python-docx reads the .docx format; legacy binary
        # .doc files will probably fail to open here -- confirm whether .doc
        # should be accepted at all.
        document = docx.Document(file_path)
        return "\n".join(para.text for para in document.paragraphs)
    raise ValueError("Unsupported file format")


def _split_sentences(text):
    """Split *text* into non-empty, stripped sentences."""
    try:
        sentences = sent_tokenize(text, language='arabic')
    except LookupError:
        # The requested punkt model is not available; fall back to splitting
        # on end-of-sentence punctuation and newlines.
        sentences = _SENTENCE_BOUNDARY.split(text)
    return [s.strip() for s in sentences if s and s.strip()]


def arabic_split_text(text, max_chars=CHUNK_SIZE):
    """Group the sentences of *text* into chunks of at most *max_chars*.

    Sentences are joined with a single space. A single sentence longer than
    *max_chars* is emitted as its own (oversized) chunk.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters.

    Returns:
        A list of non-empty chunk strings (empty if *text* has no sentences).
    """
    chunks = []
    current = ""
    for sentence in _split_sentences(text):
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_chars:
            current = candidate
            continue
        # Flush only real content: the original appended an empty chunk when
        # the very first sentence was already too long.
        if current:
            chunks.append(current)
        current = sentence
    if current:
        chunks.append(current)
    return chunks


def train_from_texts(texts):
    """Rebuild the vector store and QA chain from *texts*.

    Updates the module-level ``vector_store`` and ``qa_chain``.

    Args:
        texts: Iterable of document strings to index.

    Raises:
        ValueError: If the texts yield no chunks to index.
    """
    global vector_store, qa_chain

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
    )

    all_chunks = []
    for text in texts:
        for chunk in arabic_split_text(text):
            if len(chunk) > CHUNK_SIZE:
                # A sentence with no usable boundary; cut it by characters.
                all_chunks.extend(splitter.split_text(chunk))
            else:
                all_chunks.append(chunk)

    if not all_chunks:
        raise ValueError("No text could be extracted for indexing")

    # from_texts embeds the chunks, builds the index, and wires up the
    # docstore; the bare FAISS(...) constructor does none of that.
    vector_store = FAISS.from_texts(all_chunks, embeddings)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)


def upload_book(file, progress=gr.Progress()):
    """Gradio callback: index an uploaded book and report status.

    Args:
        file: The value delivered by ``gr.File`` -- either a path string or
            an object exposing the path as ``.name``.
        progress: Gradio progress tracker.

    Returns:
        A status message for the UI.
    """
    if file is None:
        return "الرجاء رفع كتاب أولاً."

    # Read straight from the path Gradio provides. Copying into a suffix-less
    # temp file dropped the extension that extract_text_from_file relies on,
    # and the copy was never deleted.
    source_path = getattr(file, "name", file)

    progress(0.2, desc="تحميل الملف...")
    extracted_text = extract_text_from_file(source_path)
    uploaded_texts.append(extracted_text)

    progress(0.5, desc="معالجة النص...")
    train_from_texts(uploaded_texts)

    progress(1.0, desc="اكتمل التدريب!")
    return "النظام جاهز للإجابة على أسئلتك"


def answer_question(user_question):
    """Gradio callback: answer *user_question* from the indexed books.

    Returns a prompt to upload a book if nothing has been indexed yet, or a
    prompt to type a question if the input is blank.
    """
    if qa_chain is None:
        return "الرجاء رفع كتاب أولاً."
    if not user_question or not user_question.strip():
        return "الرجاء كتابة سؤال."

    # NOTE(review): the full instruction prompt is also what the retriever
    # embeds as its query; consider passing only the question and moving the
    # instructions into the chain's prompt.
    prompt = format_arabic_prompt(user_question)
    return qa_chain.run(prompt)


with gr.Blocks() as demo:
    with gr.Tab("تحميل الكتب"):
        upload_button = gr.File(label="ارفع كتابك (.pdf .docx .doc)", file_types=[".pdf", ".docx", ".doc"])
        upload_output = gr.Textbox(label="حالة النظام")
        upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)

    with gr.Tab("اسأل الكتاب"):
        question = gr.Textbox(label="اكتب سؤالك بالعربية")
        answer = gr.Textbox(label="الإجابة")
        ask_button = gr.Button("إرسال السؤال")
        ask_button.click(answer_question, inputs=question, outputs=answer)

demo.launch(share=True)