ramysaidagieb committed
Commit 5ef2861 · verified · 1 Parent(s): 54dcc54

Update app.py

Files changed (1):
  1. app.py +98 -74
app.py CHANGED
@@ -1,101 +1,125 @@
-# Creating your fully corrected Hugging Face Space project
-
-# app.py
-
-import os
-import tempfile
 import gradio as gr
+import tempfile
+import os
 import faiss
 import numpy as np
-from transformers import AutoModel, AutoTokenizer
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from transformers import AutoTokenizer, AutoModel
 from sentence_transformers import SentenceTransformer
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from pdfminer.high_level import extract_text
-from docx import Document
+import docx

-# Load Arabic embedding model
-embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+# Initialize global variables
+embedding_model = SentenceTransformer('CAMeL-Lab/bert-base-arabic-camelbert-mix')
 index = None
 texts = []

-def extract_text_from_pdf(pdf_path):
-    return extract_text(pdf_path)
-
-def extract_text_from_docx(docx_path):
-    doc = Document(docx_path)
-    return "\n".join([para.text for para in doc.paragraphs])
+def extract_text_from_pdf(file_path):
+    try:
+        return extract_text(file_path)
+    except Exception as e:
+        print(f"Error extracting from PDF: {e}")
+        return ""
+
+def extract_text_from_docx(file_path):
+    try:
+        doc = docx.Document(file_path)
+        return "\n".join([para.text for para in doc.paragraphs])
+    except Exception as e:
+        print(f"Error extracting from DOCX: {e}")
+        return ""

 def process_files(files, progress=gr.Progress()):
     global index, texts
-    texts = []

+    if not files or len(files) == 0:
+        return "⚠️ لم يتم رفع أي ملفات. الرجاء رفع كتاب واحد على الأقل."
+
+    texts = []
     temp_dir = tempfile.mkdtemp()

-    # Step 1: Extract text
-    progress(0.1, desc="جارِ استخراج النصوص من الكتب...")
-    for file in files:
-        file_path = os.path.join(temp_dir, file.name)
-        with open(file_path, "wb") as f:
-            f.write(file.file.read())
-
-        if file.name.endswith(".pdf"):
-            text = extract_text_from_pdf(file_path)
-        elif file.name.endswith(".docx") or file.name.endswith(".doc"):
-            text = extract_text_from_docx(file_path)
-        else:
-            continue
-
-        texts.append(text)
-
-    # Step 2: Chunk the text
-    progress(0.4, desc="تقطيع النصوص إلى فقرات...")
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    chunks = []
-    for text in texts:
-        chunks.extend(splitter.split_text(text))
-
-    # Step 3: Embed the text
-    progress(0.7, desc="تحويل الفقرات إلى متجهات...")
-    embeddings = embedding_model.encode(chunks, show_progress_bar=True)
-
-    # Step 4: Build FAISS index
-    progress(0.9, desc="بناء قاعدة بيانات البحث...")
-    embeddings = np.array(embeddings).astype(np.float32)
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(embeddings)
-    texts.clear()
-    texts.extend(chunks)
-
-    return "✅ النظام جاهز للإجابة على أسئلتك"
+    try:
+        # Step 1: Extract text
+        progress(0.1, desc="جاري استخراج النصوص من الكتب...")
+        for file in files:
+            file_path = os.path.join(temp_dir, file.name)
+            with open(file_path, "wb") as f:
+                f.write(file.file.read())
+
+            if file.name.endswith(".pdf"):
+                text = extract_text_from_pdf(file_path)
+            elif file.name.endswith(".docx") or file.name.endswith(".doc"):
+                text = extract_text_from_docx(file_path)
+            else:
+                continue
+
+            if text:
+                texts.append(text)
+
+        if len(texts) == 0:
+            return "⚠️ لم يتم استخراج نصوص صالحة من الملفات."
+
+        # Step 2: Chunk the text
+        progress(0.4, desc="تقطيع النصوص إلى فقرات...")
+        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+        chunks = []
+        for text in texts:
+            chunks.extend(splitter.split_text(text))
+
+        if len(chunks) == 0:
+            return "⚠️ لا يوجد محتوى نصي كافٍ للتدريب."
+
+        # Step 3: Embed the text
+        progress(0.7, desc="تحويل الفقرات إلى متجهات...")
+        embeddings = embedding_model.encode(chunks, show_progress_bar=True)
+
+        # Step 4: Build FAISS index
+        progress(0.9, desc="بناء قاعدة بيانات البحث...")
+        embeddings = np.array(embeddings).astype(np.float32)
+        index = faiss.IndexFlatL2(embeddings.shape[1])
+        index.add(embeddings)
+        texts.clear()
+        texts.extend(chunks)
+
+        return "✅ النظام جاهز للإجابة على أسئلتك"
+    except Exception as e:
+        return f"❌ حدث خطأ أثناء التدريب: {str(e)}"

 def answer_question(question):
-    if index is None:
-        return "يرجى تحميل كتب والنقر على \"ابدأ التدريب\" أولا"
+    global index, texts

-    embedded_question = embedding_model.encode([question]).astype(np.float32)
-    D, I = index.search(embedded_question, k=1)
-    if len(I[0]) == 0:
-        return "لم يتم العثور على إجابة."
+    if index is None or len(texts) == 0:
+        return "⚠️ الرجاء رفع كتبك وتدريب النظام أولاً."

-    answer = texts[I[0][0]]
-    return answer
+    try:
+        question_embedding = embedding_model.encode([question])
+        question_embedding = np.array(question_embedding).astype(np.float32)

-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📚 محاكاة دماغ المؤلف بناءً على الكتب المرفوعة")
+        D, I = index.search(question_embedding, k=1)
+        if I[0][0] == -1:
+            return "❌ لم يتم العثور على إجابة."
+
+        retrieved_chunk = texts[I[0][0]]
+        return retrieved_chunk
+    except Exception as e:
+        return f"❌ حدث خطأ أثناء الإجابة: {str(e)}"
+
+with gr.Blocks() as demo:
+    gr.Markdown("# 📚 نظام محاكاة دماغ المؤلف العربي\nرفع كتبك ودرب النظام للإجابة على أسئلتك باللغة العربية فقط.")

     with gr.Row():
-        files = gr.File(label="ارفع ملفات الكتب", file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
-        upload_button = gr.Button("ابدأ التدريب على الكتب")
+        file_input = gr.File(label="📄 ارفع ملفات الكتب (PDF أو DOCX)", file_types=['.pdf', '.docx', '.doc'], file_count="multiple")

-    output_text = gr.Textbox(label="مخرجات التدريب", interactive=False)
+    with gr.Row():
+        train_button = gr.Button("🚀 ابدأ التدريب على الكتب")

-    upload_button.click(fn=process_files, inputs=[files], outputs=[output_text])
+    output_text = gr.Textbox(label="🔵 حالة التدريب")

-    gr.Markdown("## اطرح سؤالك بعد إكمال التدريب:")
-    question = gr.Textbox(label="سؤالك بالعربية")
-    answer = gr.Textbox(label="الإجابة", interactive=False)
-    ask_button = gr.Button("أجب عن سؤالي")
+    with gr.Row():
+        question_input = gr.Textbox(label="✍️ اكتب سؤالك هنا")
+        answer_output = gr.Textbox(label="🧠 إجابة النظام")

-    ask_button.click(fn=answer_question, inputs=[question], outputs=[answer])
+    train_button.click(fn=process_files, inputs=[file_input], outputs=[output_text])
+    question_input.submit(fn=answer_question, inputs=[question_input], outputs=[answer_output])

-demo.launch(share=True)
+demo.launch()