import gradio as gr
import os
import tempfile
import pdfminer.high_level
import docx2txt
import faiss
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load Arabic embedding model
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
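# (This model maps sentences in 50+ languages, including Arabic, to 384-dimensional
# vectors; that dimensionality is what sizes the FAISS index built below.)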
# FAISS index (vector store)
index = None
texts = []
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    return pdfminer.high_level.extract_text(pdf_path)
# Function to extract text from DOCX
def extract_text_from_docx(docx_path):
    return docx2txt.process(docx_path)
# Function to process uploaded files
def process_files(files, progress=gr.Progress()):
    global index, texts
    texts = []

    # Step 1: Extract text
    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")  # "Extracting text from the books..."
    for file in files:
        # Gradio already stores each upload as a temporary file on disk,
        # so use that path directly instead of re-writing the upload ourselves
        file_path = file.name
        if file_path.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif file_path.endswith(".docx") or file_path.endswith(".doc"):
            # Note: docx2txt only parses .docx archives; legacy binary .doc files may not extract
            text = extract_text_from_docx(file_path)
        else:
            continue
        texts.append(text)
    # Step 2: Chunk the text
    progress(0.4, desc="تقطيع النصوص إلى فقرات...")  # "Splitting the text into passages..."
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))

    # Step 3: Embed the chunks
    progress(0.7, desc="تحويل الفقرات إلى متجهات...")  # "Converting the passages into vectors..."
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    # Step 4: Build FAISS index
    progress(0.9, desc="بناء قاعدة بيانات البحث...")  # "Building the search database..."
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))

    # Keep the chunks as the retrieval corpus
    texts.clear()
    texts.extend(chunks)

    return "✅ النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"
# Function to answer Arabic questions
def answer_question(question):
    global index, texts
    if index is None or len(texts) == 0:
        return "❗ من فضلك قم بتحميل الكتب أولاً."  # "Please upload the books first."

    # Embed the question
    question_embedding = embedding_model.encode([question])

    # Search in FAISS for the 5 nearest chunks (FAISS returns -1 for missing neighbours
    # when the index holds fewer than k vectors, so filter those out)
    distances, indices = index.search(np.array(question_embedding), k=5)
    retrieved_chunks = [texts[i] for i in indices[0] if i != -1]

    # Simple answer: concatenate the most relevant chunks
    answer = "\n".join(retrieved_chunks)
    return answer
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")  # "Author Brain Simulation Engine"

    with gr.Tab("رفع الكتب"):  # "Upload books" tab
        upload = gr.File(file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
        train_button = gr.Button("ابدأ التدريب على الكتب")  # "Start training on the books"
        training_output = gr.Textbox(label="حالة التدريب")  # "Training status"

    with gr.Tab("اسأل الكتب"):  # "Ask the books" tab
        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")  # "Write your question here in Arabic"
        answer_output = gr.Textbox(label="الإجابة")  # "Answer"
        ask_button = gr.Button("أرسل السؤال")  # "Send the question"

    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])
demo.launch(share=True)
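
# Likely runtime dependencies for this Space, inferred from the imports above (assumed,
# not pinned anywhere in this file): gradio, sentence-transformers, faiss-cpu, numpy,
# pdfminer.six, docx2txt, langchain, tqdm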