ramysaidagieb committed
Commit 66289a9 · verified · 1 Parent(s): d28c712

Update app.py

Files changed (1)
  1. app.py +96 -113
app.py CHANGED
@@ -1,122 +1,105 @@
  import gradio as gr
  import os
  import tempfile
  import faiss
- import torch
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.vectorstores import FAISS
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.prompts import PromptTemplate
- from langchain.chains import RetrievalQA
- from langchain.llms import HuggingFacePipeline
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- from pdfminer.high_level import extract_text as extract_pdf_text
- import docx
- import nltk
-
- nltk.download('punkt')
- from nltk.tokenize import sent_tokenize
-
- uploaded_texts = []
- vector_store = None
- qa_chain = None
-
- embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
- embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
-
- model_name = "csebuetnlp/mT5_small"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-
- pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
- llm = HuggingFacePipeline(pipeline=pipe)
-
- ARABIC_PROMPT_TEMPLATE = """
- أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
- لا تستخدم أي معلومات خارجية.
- السؤال: {question}
- الإجابة:
- """
-
- def format_arabic_prompt(question):
-     return ARABIC_PROMPT_TEMPLATE.format(question=question)
-
- def extract_text_from_file(file_path):
-     if file_path.endswith(".pdf"):
-         return extract_pdf_text(file_path)
-     elif file_path.endswith(".docx") or file_path.endswith(".doc"):
-         doc = docx.Document(file_path)
-         return "\n".join([para.text for para in doc.paragraphs])
-     else:
-         raise ValueError("Unsupported file format")
-
- def arabic_split_text(text):
-     sentences = sent_tokenize(text, language='arabic')
-     chunks = []
-     chunk = ""
-     for sentence in sentences:
-         if len(chunk) + len(sentence) <= 500:
-             chunk += " " + sentence
          else:
-             chunks.append(chunk.strip())
-             chunk = sentence
-     if chunk:
-         chunks.append(chunk.strip())
-     return chunks
-
- def train_from_texts(texts):
-     global vector_store, qa_chain
-
-     splitter = RecursiveCharacterTextSplitter(
-         chunk_size=500,
-         chunk_overlap=100,
-         length_function=len,
-     )
-
-     all_chunks = []
      for text in texts:
-         chunks = arabic_split_text(text)
-         all_chunks.extend(chunks)
-
-     vectors = embeddings.embed_documents(all_chunks)
-     dimension = len(vectors[0])
-     index = faiss.IndexFlatL2(dimension)
-     vector_store = FAISS(embedding_function=embeddings, index=index, documents=all_chunks)
-
-     retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
-     qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
-
- def upload_book(file, progress=gr.Progress()):
-     with tempfile.NamedTemporaryFile(delete=False) as tmp:
-         tmp.write(file.read())
-         tmp_path = tmp.name
-
-     progress(0.2, desc="تحميل الملف...")
-     extracted_text = extract_text_from_file(tmp_path)
-     uploaded_texts.append(extracted_text)
-     progress(0.5, desc="معالجة النص...")
-
-     train_from_texts(uploaded_texts)
-     progress(1.0, desc="اكتمل التدريب!")
-     return "النظام جاهز للإجابة على أسئلتك"
-
- def answer_question(user_question):
-     if qa_chain is None:
-         return "الرجاء رفع كتاب أولاً."
-     prompt = format_arabic_prompt(user_question)
-     result = qa_chain.run(prompt)
-     return result
  with gr.Blocks() as demo:
-     with gr.Tab("تحميل الكتب"):
-         upload_button = gr.File(label="ارفع كتابك (.pdf .docx .doc)", file_types=[".pdf", ".docx", ".doc"])
-         upload_output = gr.Textbox(label="حالة النظام")
-         upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)
-
-     with gr.Tab("اسأل الكتاب"):
-         question = gr.Textbox(label="اكتب سؤالك بالعربية")
-         answer = gr.Textbox(label="الإجابة")
-         ask_button = gr.Button("إرسال السؤال")
-         ask_button.click(answer_question, inputs=question, outputs=answer)
-
- demo.launch(share=True)
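One note on the removed implementation: the LangChain FAISS wrapper has no documents= constructor argument (it expects a docstore plus an index-to-id mapping), so train_from_texts would likely raise a TypeError at that call. The usual construction goes through the from_texts helper. A minimal sketch, assuming the embeddings and all_chunks objects defined in the removed code above:

# Sketch only: conventional LangChain FAISS construction for the removed code path.
# `all_chunks` and `embeddings` are the objects from train_from_texts above.
from langchain.vectorstores import FAISS

vector_store = FAISS.from_texts(all_chunks, embedding=embeddings)  # builds the index and docstore internally
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})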
  import gradio as gr
  import os
  import tempfile
+ import pdfminer.high_level
+ import docx2txt
  import faiss
+ import numpy as np
+ from tqdm import tqdm
+ from sentence_transformers import SentenceTransformer
  from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ # Load Arabic embedding model
+ embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
+ # FAISS index (vector store)
+ index = None
+ texts = []
+
+ # Function to extract text from PDF
+ def extract_text_from_pdf(pdf_path):
+     return pdfminer.high_level.extract_text(pdf_path)
+
+ # Function to extract text from DOCX
+ def extract_text_from_docx(docx_path):
+     return docx2txt.process(docx_path)
+
+ # Function to process uploaded files
+ def process_files(files, progress=gr.Progress()):
+     global index, texts
+     texts = []
+
+     temp_dir = tempfile.mkdtemp()
+
+     # Step 1: Extract text
+     progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")
+     for file in files:
+         file_path = os.path.join(temp_dir, file.name)
+         with open(file_path, "wb") as f:
+             f.write(file.read())
+
+         if file.name.endswith(".pdf"):
+             text = extract_text_from_pdf(file_path)
+         elif file.name.endswith(".docx") or file.name.endswith(".doc"):
+             text = extract_text_from_docx(file_path)
          else:
+             continue
+
+         texts.append(text)
+
+     # Step 2: Chunk the text
+     progress(0.4, desc="تقطيع النصوص إلى فقرات...")
+     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+     chunks = []
      for text in texts:
+         chunks.extend(splitter.split_text(text))

+     # Step 3: Embed the text
+     progress(0.7, desc="تحويل الفقرات إلى متجهات...")
+     embeddings = embedding_model.encode(chunks, show_progress_bar=True)
+
+     # Step 4: Build FAISS index
+     progress(0.9, desc="بناء قاعدة بيانات البحث...")
+     index = faiss.IndexFlatL2(embeddings.shape[1])
+     index.add(np.array(embeddings))
+     texts.clear()
+     texts.extend(chunks)
+
+     return "✅ النظام جاهز للإجابة على أسئلتك"
+
+ # Function to answer Arabic questions
+ def answer_question(question):
+     global index, texts
+
+     if index is None or len(texts) == 0:
+         return "❗ من فضلك قم بتحميل الكتب أولاً."
+
+     # Embed the question
+     question_embedding = embedding_model.encode([question])
+
+     # Search in FAISS
+     distances, indices = index.search(np.array(question_embedding), k=5)
+     retrieved_chunks = [texts[i] for i in indices[0]]
+
+     # Simple answer: concatenate most relevant chunks
+     answer = "\n".join(retrieved_chunks)
+     return answer
+
+ # Gradio UI
  with gr.Blocks() as demo:
+     gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")
+
+     with gr.Tab("رفع الكتب"):
+         upload = gr.File(file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
+         train_button = gr.Button("ابدأ التدريب على الكتب")
+         training_output = gr.Textbox(label="حالة التدريب")
+
+     with gr.Tab("اسأل الكتب"):
+         question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")
+         answer_output = gr.Textbox(label="الإجابة")
+         ask_button = gr.Button("أرسل السؤال")
+
+     train_button.click(fn=process_files, inputs=[upload], outputs=[training_output])
+     ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])
+
+ demo.launch(share=True)
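The new retrieval core is plain SentenceTransformer encoding plus exact L2 search in FAISS, so it can be smoke-tested without the Gradio UI. A minimal sketch using the same model name as the commit; the sample chunks and query below are made-up stand-ins for real book paragraphs:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Hypothetical chunks standing in for split book text
sample_chunks = [
    "ولد المؤلف في قرية صغيرة.",              # "The author was born in a small village."
    "انتقل إلى المدينة في شبابه.",            # "He moved to the city in his youth."
    "يتناول الفصل الأخير فلسفته في الحياة.",  # "The final chapter covers his philosophy of life."
]
embeddings = model.encode(sample_chunks)        # float32 array, shape (n_chunks, 384)

index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search, as in the commit
index.add(np.array(embeddings))

query_embedding = model.encode(["أين ولد المؤلف؟"])  # "Where was the author born?"
distances, indices = index.search(np.array(query_embedding), 2)
print([sample_chunks[i] for i in indices[0]])   # nearest chunks, best match first

Note that answer_question simply concatenates the retrieved chunks; unlike the removed version, there is no generative model, so answer quality rests entirely on this nearest-neighbour step.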