ramysaidagieb committed
Commit 5ef2861 · verified · 1 Parent(s): 54dcc54

Update app.py

Files changed (1):
  1. app.py +98 -74
app.py CHANGED
@@ -1,101 +1,125 @@
-# Creating your fully corrected Hugging Face Space project
-
-# app.py
-
-import os
-import tempfile
 import gradio as gr
+import tempfile
+import os
 import faiss
 import numpy as np
-from transformers import AutoModel, AutoTokenizer
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from transformers import AutoTokenizer, AutoModel
 from sentence_transformers import SentenceTransformer
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from pdfminer.high_level import extract_text
-from docx import Document
+import docx

-# Load Arabic embedding model
-embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+# Initialize global variables
+embedding_model = SentenceTransformer('CAMeL-Lab/bert-base-arabic-camelbert-mix')
 index = None
 texts = []

-def extract_text_from_pdf(pdf_path):
-    return extract_text(pdf_path)
-
-def extract_text_from_docx(docx_path):
-    doc = Document(docx_path)
-    return "\n".join([para.text for para in doc.paragraphs])
+def extract_text_from_pdf(file_path):
+    try:
+        return extract_text(file_path)
+    except Exception as e:
+        print(f"Error extracting from PDF: {e}")
+        return ""
+
+def extract_text_from_docx(file_path):
+    try:
+        doc = docx.Document(file_path)
+        return "\n".join([para.text for para in doc.paragraphs])
+    except Exception as e:
+        print(f"Error extracting from DOCX: {e}")
+        return ""

 def process_files(files, progress=gr.Progress()):
     global index, texts
-    texts = []

+    if not files or len(files) == 0:
+        return "⚠️ لم يتم رفع أي ملفات. الرجاء رفع كتاب واحد على الأقل."
+
+    texts = []
     temp_dir = tempfile.mkdtemp()

-    # Step 1: Extract text
-    progress(0.1, desc="جارِ استخراج النصوص من الكتب...")
-    for file in files:
-        file_path = os.path.join(temp_dir, file.name)
-        with open(file_path, "wb") as f:
-            f.write(file.file.read())
-
-        if file.name.endswith(".pdf"):
-            text = extract_text_from_pdf(file_path)
-        elif file.name.endswith(".docx") or file.name.endswith(".doc"):
-            text = extract_text_from_docx(file_path)
-        else:
-            continue
-
-        texts.append(text)
-
-    # Step 2: Chunk the text
-    progress(0.4, desc="تقطيع النصوص إلى فقرات...")
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    chunks = []
-    for text in texts:
-        chunks.extend(splitter.split_text(text))
-
-    # Step 3: Embed the text
-    progress(0.7, desc="تحويل الفقرات إلى متجهات...")
-    embeddings = embedding_model.encode(chunks, show_progress_bar=True)
-
-    # Step 4: Build FAISS index
-    progress(0.9, desc="بناء قاعدة بيانات البحث...")
-    embeddings = np.array(embeddings).astype(np.float32)
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(embeddings)
-    texts.clear()
-    texts.extend(chunks)
-
-    return "✅ النظام جاهز للإجابة على أسئلتك"
+    try:
+        # Step 1: Extract text
+        progress(0.1, desc="جاري استخراج النصوص من الكتب...")
+        for file in files:
+            file_path = os.path.join(temp_dir, file.name)
+            with open(file_path, "wb") as f:
+                f.write(file.file.read())
+
+            if file.name.endswith(".pdf"):
+                text = extract_text_from_pdf(file_path)
+            elif file.name.endswith(".docx") or file.name.endswith(".doc"):
+                text = extract_text_from_docx(file_path)
+            else:
+                continue
+
+            if text:
+                texts.append(text)
+
+        if len(texts) == 0:
+            return "⚠️ لم يتم استخراج نصوص صالحة من الملفات."
+
+        # Step 2: Chunk the text
+        progress(0.4, desc="تقطيع النصوص إلى فقرات...")
+        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+        chunks = []
+        for text in texts:
+            chunks.extend(splitter.split_text(text))
+
+        if len(chunks) == 0:
+            return "⚠️ لا يوجد محتوى نصي كافٍ للتدريب."
+
+        # Step 3: Embed the text
+        progress(0.7, desc="تحويل الفقرات إلى متجهات...")
+        embeddings = embedding_model.encode(chunks, show_progress_bar=True)
+
+        # Step 4: Build FAISS index
+        progress(0.9, desc="بناء قاعدة بيانات البحث...")
+        embeddings = np.array(embeddings).astype(np.float32)
+        index = faiss.IndexFlatL2(embeddings.shape[1])
+        index.add(embeddings)
+        texts.clear()
+        texts.extend(chunks)
+
+        return "✅ النظام جاهز للإجابة على أسئلتك"
+    except Exception as e:
+        return f"❌ حدث خطأ أثناء التدريب: {str(e)}"

 def answer_question(question):
-    if index is None:
-        return "يرجى تحميل كتب والنقر على \"ابدأ التدريب\" أولا"
+    global index, texts

-    embedded_question = embedding_model.encode([question]).astype(np.float32)
-    D, I = index.search(embedded_question, k=1)
-    if len(I[0]) == 0:
-        return "لم يتم العثور على إجابة."
+    if index is None or len(texts) == 0:
+        return "⚠️ الرجاء رفع كتبك وتدريب النظام أولاً."

-    answer = texts[I[0][0]]
-    return answer
+    try:
+        question_embedding = embedding_model.encode([question])
+        question_embedding = np.array(question_embedding).astype(np.float32)

-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📚 محاكاة دماغ المؤلف بناءً على الكتب المرفوعة")
+        D, I = index.search(question_embedding, k=1)
+        if I[0][0] == -1:
+            return "❌ لم يتم العثور على إجابة."
+
+        retrieved_chunk = texts[I[0][0]]
+        return retrieved_chunk
+    except Exception as e:
+        return f"❌ حدث خطأ أثناء الإجابة: {str(e)}"
+
+with gr.Blocks() as demo:
+    gr.Markdown("# 📚 نظام محاكاة دماغ المؤلف العربي\nرفع كتبك ودرب النظام للإجابة على أسئلتك باللغة العربية فقط.")

     with gr.Row():
-        files = gr.File(label="ارفع ملفات الكتب", file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
-        upload_button = gr.Button("ابدأ التدريب على الكتب")
+        file_input = gr.File(label="📄 ارفع ملفات الكتب (PDF أو DOCX)", file_types=['.pdf', '.docx', '.doc'], file_count="multiple")

-    output_text = gr.Textbox(label="مخرجات التدريب", interactive=False)
+    with gr.Row():
+        train_button = gr.Button("🚀 ابدأ التدريب على الكتب")

-    upload_button.click(fn=process_files, inputs=[files], outputs=[output_text])
+    output_text = gr.Textbox(label="🔵 حالة التدريب")

-    gr.Markdown("## اطرح سؤالك بعد إكمال التدريب:")
-    question = gr.Textbox(label="سؤالك بالعربية")
-    answer = gr.Textbox(label="الإجابة", interactive=False)
-    ask_button = gr.Button("أجب عن سؤالي")
+    with gr.Row():
+        question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا")
+        answer_output = gr.Textbox(label="🧠 إجابة النظام")

-    ask_button.click(fn=answer_question, inputs=[question], outputs=[answer])
+    train_button.click(fn=process_files, inputs=[file_input], outputs=[output_text])
+    question_input.submit(fn=answer_question, inputs=[question_input], outputs=[answer_output])

-demo.launch(share=True)
+demo.launch()