ramysaidagieb committed (verified)
Commit 8876843 · 1 Parent(s): ea67549

Update app.py

Files changed (1):
  1. app.py +44 -48
app.py CHANGED
@@ -1,30 +1,30 @@
-import gradio as gr
+# Creating your fully corrected Hugging Face Space project
+
+# app.py
+
 import os
 import tempfile
-import pdfminer.high_level
-import docx2txt
+import gradio as gr
 import faiss
 import numpy as np
-from tqdm import tqdm
-from sentence_transformers import SentenceTransformer
+from transformers import AutoModel, AutoTokenizer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from pdfminer.high_level import extract_text
+from docx import Document
 
 # Load Arabic embedding model
-embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
-
-# FAISS index (vector store)
+embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
 index = None
 texts = []
 
-# Function to extract text from PDF
 def extract_text_from_pdf(pdf_path):
-    return pdfminer.high_level.extract_text(pdf_path)
+    return extract_text(pdf_path)
 
-# Function to extract text from DOCX
 def extract_text_from_docx(docx_path):
-    return docx2txt.process(docx_path)
+    doc = Document(docx_path)
+    return "\n".join([para.text for para in doc.paragraphs])
 
-# Function to process uploaded files
 def process_files(files, progress=gr.Progress()):
     global index, texts
     texts = []
@@ -32,11 +32,11 @@ def process_files(files, progress=gr.Progress()):
     temp_dir = tempfile.mkdtemp()
 
     # Step 1: Extract text
-    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")
+    progress(0.1, desc="جارِ استخراج النصوص من الكتب...")
     for file in files:
         file_path = os.path.join(temp_dir, file.name)
         with open(file_path, "wb") as f:
-            f.write(file.read())
+            f.write(file.file.read())
 
         if file.name.endswith(".pdf"):
             text = extract_text_from_pdf(file_path)
@@ -48,58 +48,54 @@
         texts.append(text)
 
     # Step 2: Chunk the text
-    progress(0.4, desc="تقطيع النصوص إلى فقرات...")
+    progress(0.4, desc="تقطيع النصوص إلى فقرات...")
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     chunks = []
     for text in texts:
         chunks.extend(splitter.split_text(text))
 
     # Step 3: Embed the text
-    progress(0.7, desc="تحويل الفقرات إلى متجهات...")
+    progress(0.7, desc="تحويل الفقرات إلى متجهات...")
     embeddings = embedding_model.encode(chunks, show_progress_bar=True)
 
     # Step 4: Build FAISS index
-    progress(0.9, desc="بناء قاعدة بيانات البحث...")
+    progress(0.9, desc="بناء قاعدة بيانات البحث...")
+    embeddings = np.array(embeddings).astype(np.float32)
     index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(np.array(embeddings))
+    index.add(embeddings)
     texts.clear()
     texts.extend(chunks)
 
-    return " النظام جاهز للإجابة على أسئلتك"
+    return "✅ النظام جاهز للإجابة على أسئلتك"
 
-# Function to answer Arabic questions
 def answer_question(question):
-    global index, texts
+    if index is None:
+        return "يرجى تحميل كتب والنقر على \"ابدأ التدريب\" أولا"
 
-    if index is None or len(texts) == 0:
-        return "❗ من فضلك قم بتحميل الكتب أولاً."
+    embedded_question = embedding_model.encode([question]).astype(np.float32)
+    D, I = index.search(embedded_question, k=1)
+    if len(I[0]) == 0:
+        return "لم يتم العثور على إجابة."
 
-    # Embed the question
-    question_embedding = embedding_model.encode([question])
+    answer = texts[I[0][0]]
+    return answer
 
-    # Search in FAISS
-    distances, indices = index.search(np.array(question_embedding), k=5)
-    retrieved_chunks = [texts[i] for i in indices[0]]
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📚 محاكاة دماغ المؤلف بناءً على الكتب المرفوعة")
 
-    # Simple answer: concatenate most relevant chunks
-    answer = "\n".join(retrieved_chunks)
-    return answer
+    with gr.Row():
+        files = gr.File(label="ارفع ملفات الكتب", file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
+        upload_button = gr.Button("ابدأ التدريب على الكتب")
+
+    output_text = gr.Textbox(label="مخرجات التدريب", interactive=False)
+
+    upload_button.click(fn=process_files, inputs=[files], outputs=[output_text])
+
+    gr.Markdown("## اطرح سؤالك بعد إكمال التدريب:")
+    question = gr.Textbox(label="سؤالك بالعربية")
+    answer = gr.Textbox(label="الإجابة", interactive=False)
+    ask_button = gr.Button("أجب عن سؤالي")
 
-# Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")
-
-    with gr.Tab("رفع الكتب"):
-        upload = gr.File(file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
-        train_button = gr.Button("ابدأ التدريب على الكتب")
-        training_output = gr.Textbox(label="حالة التدريب")
-
-    with gr.Tab("اسأل الكتب"):
-        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")
-        answer_output = gr.Textbox(label="الإجابة")
-        ask_button = gr.Button("أرسل السؤال")
-
-    train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
-    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])
+    ask_button.click(fn=answer_question, inputs=[question], outputs=[answer])
 
 demo.launch(share=True)
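
The substantive fixes in this commit are the float32 cast before index.add (the FAISS Python bindings only accept float32 arrays, so casting defensively before add() avoids a type error) and reading uploads through file.file.read() instead of file.read(). A minimal sketch of the same build-and-query round trip outside Gradio; the model name matches the Space, while the sample chunks and question are illustrative stand-ins:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Same multilingual embedding model the Space loads.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Stand-ins for the chunks RecursiveCharacterTextSplitter would produce from the books.
chunks = ["chunk about topic A", "chunk about topic B", "chunk about topic C"]

# FAISS requires float32, hence the cast the commit adds before index.add().
embeddings = np.array(model.encode(chunks)).astype(np.float32)
index = faiss.IndexFlatL2(embeddings.shape[1])  # exact (brute-force) L2 search
index.add(embeddings)

# Query path mirroring answer_question(): embed, search with k=1, map the hit back to its chunk.
query = model.encode(["a question about topic B"]).astype(np.float32)
D, I = index.search(query, k=1)
print(chunks[I[0][0]], D[0][0])  # nearest chunk and its squared L2 distance

With k=1 the app now returns the single nearest chunk verbatim, where the previous version searched with k=5 and concatenated the hits. Two smaller observations: from transformers import AutoModel, AutoTokenizer is unused in the new file, and the imports imply a requirements.txt of roughly gradio, numpy, sentence-transformers, transformers, langchain, pdfminer.six, python-docx, and faiss-cpu (assuming CPU hardware).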