Spaces:

ramysaidagieb
/

Answer1

Sleeping

App Files Files Community

ramysaidagieb committed on Apr 27

Commit

8876843

verified ·

1 Parent(s): ea67549

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -48

app.py CHANGED Viewed

@@ -1,30 +1,30 @@
-import gradio as gr
 import os
 import tempfile
-import pdfminer.high_level
-import docx2txt
 import faiss
 import numpy as np
-from tqdm import tqdm
-from sentence_transformers import SentenceTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 # Load Arabic embedding model
-embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
-# FAISS index (vector store)
 index = None
 texts = []
-# Function to extract text from PDF
 def extract_text_from_pdf(pdf_path):
-    return pdfminer.high_level.extract_text(pdf_path)
-# Function to extract text from DOCX
 def extract_text_from_docx(docx_path):
-    return docx2txt.process(docx_path)
-# Function to process uploaded files
 def process_files(files, progress=gr.Progress()):
     global index, texts
     texts = []
@@ -32,11 +32,11 @@ def process_files(files, progress=gr.Progress()):
     temp_dir = tempfile.mkdtemp()
     # Step 1: Extract text
-    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")
     for file in files:
         file_path = os.path.join(temp_dir, file.name)
         with open(file_path, "wb") as f:
-            f.write(file.read())
         if file.name.endswith(".pdf"):
             text = extract_text_from_pdf(file_path)
@@ -48,58 +48,54 @@ def process_files(files, progress=gr.Progress()):
         texts.append(text)
     # Step 2: Chunk the text
-    progress(0.4, desc="تقطيع النصوص إلى فقرات...")
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     chunks = []
     for text in texts:
         chunks.extend(splitter.split_text(text))
     # Step 3: Embed the text
-    progress(0.7, desc="تحويل الفقرات إلى متجهات...")
     embeddings = embedding_model.encode(chunks, show_progress_bar=True)
     # Step 4: Build FAISS index
-    progress(0.9, desc="بناء قاعدة بيانات البحث...")
     index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(np.array(embeddings))
     texts.clear()
     texts.extend(chunks)
-    return "✅ النظام جاهز للإجابة على أسئلتك"
-# Function to answer Arabic questions
 def answer_question(question):
-    global index, texts
-    if index is None or len(texts) == 0:
-        return "❗ من فضلك قم بتحميل الكتب أولاً."
-    # Embed the question
-    question_embedding = embedding_model.encode([question])
-    # Search in FAISS
-    distances, indices = index.search(np.array(question_embedding), k=5)
-    retrieved_chunks = [texts[i] for i in indices[0]]
-    # Simple answer: concatenate most relevant chunks
-    answer = "\n".join(retrieved_chunks)
-    return answer
-# Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")
-    with gr.Tab("رفع الكتب"):
-        upload = gr.File(file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
-        train_button = gr.Button("ابدأ التدريب على الكتب")
-        training_output = gr.Textbox(label="حالة التدريب")
-    with gr.Tab("اسأل الكتب"):
-        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")
-        answer_output = gr.Textbox(label="الإجابة")
-        ask_button = gr.Button("أرسل السؤال")
-    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
-    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])
 demo.launch(share=True)

+# Creating your fully corrected Hugging Face Space project
+# app.py
 import os
 import tempfile
+import gradio as gr
 import faiss
 import numpy as np
+from transformers import AutoModel, AutoTokenizer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from pdfminer.high_level import extract_text
+from docx import Document
 # Load Arabic embedding model
+embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
 index = None
 texts = []
 def extract_text_from_pdf(pdf_path):
+    return extract_text(pdf_path)
 def extract_text_from_docx(docx_path):
+    doc = Document(docx_path)
+    return "\n".join([para.text for para in doc.paragraphs])
 def process_files(files, progress=gr.Progress()):
     global index, texts
     texts = []
     temp_dir = tempfile.mkdtemp()
     # Step 1: Extract text
+    progress(0.1, desc="\u062c\u0627\u0631\u0650 \u0627\u0633\u062a\u062e\u0631\u0627\u062c \u0627\u0644\u0646\u0635\u0648\u0635 \u0645\u0646 \u0627\u0644\u0643\u062a\u0628...")
     for file in files:
         file_path = os.path.join(temp_dir, file.name)
         with open(file_path, "wb") as f:
+            f.write(file.file.read())
         if file.name.endswith(".pdf"):
             text = extract_text_from_pdf(file_path)
         texts.append(text)
     # Step 2: Chunk the text
+    progress(0.4, desc="\u062a\u0642\u0637\u064a\u0639 \u0627\u0644\u0646\u0635\u0648\u0635 \u0625\u0644\u0649 \u0641\u0642\u0631\u0627\u062a...")
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     chunks = []
     for text in texts:
         chunks.extend(splitter.split_text(text))
     # Step 3: Embed the text
+    progress(0.7, desc="\u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0641\u0642\u0631\u0627\u062a \u0625\u0644\u0649 \u0645\u062a\u062c\u0647\u0627\u062a...")
     embeddings = embedding_model.encode(chunks, show_progress_bar=True)
     # Step 4: Build FAISS index
+    progress(0.9, desc="\u0628\u0646\u0627\u0621 \u0642\u0627\u0639\u062f\u0629 \u0628\u064a\u0627\u0646\u0627\u062a \u0627\u0644\u0628\u062d\u062b...")
+    embeddings = np.array(embeddings).astype(np.float32)
     index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(embeddings)
     texts.clear()
     texts.extend(chunks)
+    return "\u2705 \u0627\u0644\u0646\u0638\u0627\u0645 \u062c\u0627\u0647\u0632 \u0644\u0644\u0625\u062c\u0627\u0628\u0629 \u0639\u0644\u0649 \u0623\u0633\u0626\u0644\u062a\u0643"
 def answer_question(question):
+    if index is None:
+        return "\u064a\u0631\u062c\u0649 \u062a\u062d\u0645\u064a\u0644 \u0643\u062a\u0628 \u0648\u0627\u0644\u0646\u0642\u0631 \u0639\u0644\u0649 \"\u0627\u0628\u062f\u0623 \u0627\u0644\u062a\u062f\u0631\u064a\u0628\" \u0623\u0648\u0644\u0627"
+    embedded_question = embedding_model.encode([question]).astype(np.float32)
+    D, I = index.search(embedded_question, k=1)
+    if len(I[0]) == 0:
+        return "\u0644\u0645 \u064a\u062a\u0645 \u0627\u0644\u0639\u062b\u0648\u0631 \u0639\u0644\u0649 \u0625\u062c\u0627\u0628\u0629."
+    answer = texts[I[0][0]]
+    return answer
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# \ud83d\udcda محاكاة دماغ المؤلف بناءً على الكتب المرفوعة")
+    with gr.Row():
+        files = gr.File(label="ارفع ملفات الكتب", file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
+        upload_button = gr.Button("ابدأ التدريب على الكتب")
+    output_text = gr.Textbox(label="مخرجات التدريب", interactive=False)
+    upload_button.click(fn=process_files, inputs=[files], outputs=[output_text])
+    gr.Markdown("## اطرح سؤالك بعد إكمال التدريب:")
+    question = gr.Textbox(label="سؤالك بالعربية")
+    answer = gr.Textbox(label="الإجابة", interactive=False)
+    ask_button = gr.Button("أجب عن سؤالي")
+    ask_button.click(fn=answer_question, inputs=[question], outputs=[answer])
 demo.launch(share=True)