import gradio as gr
import numpy as np
import faiss
import docx2txt
import pdfminer.high_level
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load a multilingual embedding model with good Arabic coverage
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# FAISS index (vector store) and the chunks it indexes
index = None
texts = []

# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    return pdfminer.high_level.extract_text(pdf_path)

# Extract text from a DOCX file (docx2txt does not support legacy .doc)
def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)

# Process uploaded files: extract, chunk, embed, index
def process_files(files, progress=gr.Progress()):
    global index, texts
    raw_texts = []

    # Step 1: Extract text. Gradio already stores each upload in a temp file
    # whose path is `file.name`, so there is no need to copy the bytes again.
    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")  # "Extracting text from the books..."
    for file in files:
        file_path = file.name
        if file_path.endswith(".pdf"):
            raw_texts.append(extract_text_from_pdf(file_path))
        elif file_path.endswith(".docx"):
            raw_texts.append(extract_text_from_docx(file_path))

    # Step 2: Split the text into overlapping chunks
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")  # "Splitting the text into passages..."
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in raw_texts:
        chunks.extend(splitter.split_text(text))
    if not chunks:
        return "❗ لم يتم استخراج أي نص من الملفات المرفوعة."  # "No text was extracted from the uploaded files."

    # Step 3: Embed the chunks
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")  # "Converting the passages into vectors..."
    embeddings = np.asarray(
        embedding_model.encode(chunks, show_progress_bar=True), dtype="float32"
    )

    # Step 4: Build the FAISS index (exact L2 search)
    progress(0.9, desc="بناء قاعدة بيانات البحث...")  # "Building the search database..."
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    texts = chunks
    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"

# Answer Arabic questions by retrieving the closest chunks
def answer_question(question):
    if index is None or not texts:
        return "❗ من فضلك قم بتحميل الكتب أولاً."  # "Please upload the books first."

    # Embed the question and search FAISS for the nearest chunks;
    # cap k at the index size and skip the -1 padding FAISS returns
    question_embedding = np.asarray(embedding_model.encode([question]), dtype="float32")
    k = min(5, index.ntotal)
    distances, indices = index.search(question_embedding, k)
    retrieved_chunks = [texts[i] for i in indices[0] if i != -1]

    # Simple extractive answer: concatenate the most relevant chunks
    return "\n".join(retrieved_chunks)

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")  # "Author Brain Simulation Engine"
    with gr.Tab("رفع الكتب"):  # "Upload the books"
        upload = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        train_button = gr.Button("ابدأ التدريب على الكتب")  # "Start training on the books"
        training_output = gr.Textbox(label="حالة التدريب")  # "Training status"
    with gr.Tab("اسأل الكتب"):  # "Ask the books"
        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
        answer_output = gr.Textbox(label="الإجابة")  # "The answer"
        ask_button = gr.Button("أرسل السؤال")  # "Send the question"

    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

if __name__ == "__main__":
    demo.launch(share=True)
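
# --- Setup note (not part of the original script; package names inferred from the
# imports above, so treat them as assumptions) ---
# pip install gradio numpy faiss-cpu docx2txt pdfminer.six sentence-transformers langchain
# Run the app with:  python app.py
# On newer LangChain releases the splitter lives in the separate
# `langchain_text_splitters` package; if the import above fails, install that
# package and import RecursiveCharacterTextSplitter from it instead.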