ramysaidagieb committed on
Commit
6d2438e
·
verified ·
1 Parent(s): 0ffbfee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -65
app.py CHANGED
@@ -1,91 +1,110 @@
1
- # app.py
2
  import os
3
  import shutil
4
- import chromadb
5
- import gradio as gr
6
- from ctransformers import AutoModelForCausalLM
7
- from langchain.embeddings import SentenceTransformerEmbeddings
8
- from langchain.vectorstores import Chroma
9
  from langchain.chains import RetrievalQA
10
- from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
 
 
 
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
- from langchain.llms import CTransformers
 
 
 
13
 
14
- # 1. إعداد نموذج LLM بدون API باستخدام ctransformers
15
- llm = AutoModelForCausalLM.from_pretrained(
16
- model_path_or_repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
17
  model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
18
  model_type="mistral",
19
- config={"max_new_tokens": 512, "temperature": 0.7}
 
 
 
 
 
20
  )
21
 
22
- # 2. إعداد مجلد التخزين
23
- CHROMA_DIR = "chroma_store"
24
- if os.path.exists(CHROMA_DIR):
25
- shutil.rmtree(CHROMA_DIR)
26
-
27
- # 3. تحميل الملفات وتقسيمها
28
- SUPPORTED_TYPES = {"pdf": PyPDFLoader, "docx": Docx2txtLoader, "txt": TextLoader}
29
 
30
- def load_documents(file_paths):
 
31
  documents = []
32
- for path in file_paths:
33
- ext = path.split(".")[-1].lower()
34
- loader_class = SUPPORTED_TYPES.get(ext)
35
- if loader_class:
36
- loader = loader_class(path)
37
- docs = loader.load()
38
- documents.extend(docs)
 
 
 
 
 
39
  return documents
40
 
41
- # 4. تقسيم النصوص وإنشاء المتجهات
42
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
43
- embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
44
-
45
  def create_vectorstore(docs):
46
- texts = text_splitter.split_documents(docs)
47
- return Chroma.from_documents(texts, embedding, persist_directory=CHROMA_DIR)
 
48
 
49
- # 5. واجهة Gradio
50
- uploaded_files = []
51
- db = None
52
- qa_chain = None
53
 
54
- def upload_files(files):
55
- global uploaded_files, db, qa_chain
56
- uploaded_paths = [f.name for f in files]
57
- uploaded_files.extend(uploaded_paths)
58
- docs = load_documents(uploaded_paths)
59
- db = create_vectorstore(docs)
60
- retriever = db.as_retriever(search_kwargs={"k": 5})
61
- qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
62
- return "تم تحميل الملفات وبناء قاعدة المعرفة بنجاح ✅"
63
 
64
- def answer_question_arabic(question):
65
- if not qa_chain:
66
- return "من فضلك قم أولاً بتحميل ملفاتك وبناء قاعدة المعرفة."
67
- result = qa_chain.run(question)
68
- return result
69
 
70
- with gr.Blocks(theme=gr.themes.Soft(), rtl=True, title="Smart PDF Assistant") as demo:
71
- gr.Markdown("""
72
- # 🤖 مساعد الوثائق الذكي باللغة العربية
73
- أرفق ملفاتك (PDF، DOCX، TXT) ثم اسأل أي سؤال.
74
- """)
 
 
75
 
76
- with gr.Row():
77
- file_input = gr.File(file_types=[".pdf", ".docx", ".txt"], file_count="multiple", label="📁 ارفع ملفاتك")
78
- upload_button = gr.Button("تحميل الملفات")
 
 
 
 
 
 
 
 
 
79
 
80
- status_output = gr.Textbox(label="الحالة")
 
 
 
 
 
81
 
 
 
 
82
  with gr.Row():
83
- question_input = gr.Textbox(lines=2, placeholder="✍️ اكتب سؤالك هنا", label="السؤال")
84
- answer_button = gr.Button("أرسل")
85
- answer_output = gr.Textbox(label="الإجابة", lines=5)
 
86
 
87
- upload_button.click(upload_files, inputs=[file_input], outputs=[status_output])
88
- answer_button.click(answer_question_arabic, inputs=[question_input], outputs=[answer_output])
89
 
 
90
  if __name__ == "__main__":
91
  demo.launch()
 
 
1
  import os
2
  import shutil
3
+ import tempfile
4
+ from langchain_community.llms import CTransformers
 
 
 
5
  from langchain.chains import RetrievalQA
6
+ from langchain.prompts import PromptTemplate
7
+ from langchain.vectorstores import Chroma
8
+ from langchain.embeddings import SentenceTransformerEmbeddings
9
+ from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from fastapi import FastAPI, UploadFile, File
12
+ from fastapi.responses import JSONResponse
13
+ import uvicorn
14
+ import gradio as gr
15
 
16
+ # إعداد نموذج اللغة
17
# Generation settings forwarded to the model through the `config` argument.
_LLM_CONFIG = dict(
    max_new_tokens=512,
    temperature=0.7,
    context_length=4096,
    gpu_layers=20,
)

# Language model: a quantized Mistral-7B-Instruct GGUF file wrapped by CTransformers.
llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    config=_LLM_CONFIG,
)

# Semantic-matching setup: the sentence-transformers model used to embed chunks.
_EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedding_function = SentenceTransformerEmbeddings(model_name=_EMBEDDING_MODEL_NAME)
 
 
 
 
 
31
 
32
+ # تحميل المستندات وإنشاء قاعدة معرفية
33
def load_documents_from_folder(folder_path):
    """Load every supported document found directly inside *folder_path*.

    Files are dispatched on their extension: ``.pdf`` goes to ``PyPDFLoader``,
    ``.docx`` to ``Docx2txtLoader`` and ``.txt`` to ``TextLoader``.  The
    extension is compared case-insensitively, so ``REPORT.PDF`` is loaded the
    same way as ``report.pdf``.  Any other entry is skipped.

    Args:
        folder_path: Directory to scan (sub-directories are not descended into).

    Returns:
        A flat list holding whatever each loader's ``load()`` returned, in
        file-name order; an empty list when no supported file is present.

    Raises:
        OSError: If *folder_path* cannot be listed.
    """
    documents = []
    # Sorted so the resulting order does not depend on the filesystem listing.
    for filename in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, filename)
        # Lower-case only for the comparison; the real name is kept for the path.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(full_path)
        elif lowered.endswith(".docx"):
            loader = Docx2txtLoader(full_path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(full_path)
        else:
            continue
        documents.extend(loader.load())
    return documents
48
 
 
 
 
 
def create_vectorstore(docs, chunk_size=1000, chunk_overlap=200):
    """Split *docs* into overlapping chunks and index them in a new Chroma store.

    Args:
        docs: Documents to index (as produced by the document loaders).
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The ``Chroma`` vector store built from the chunks, embedded with the
        module-level ``embedding_function``.

    Raises:
        ValueError: If *docs* is empty, since there would be nothing to index.
    """
    if not docs:
        # Fail early with a clear message instead of handing the vector store
        # an empty batch.
        raise ValueError("No documents to index: nothing could be loaded.")

    import uuid  # local import: only needed to name the collection

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)
    # A fresh collection name per call keeps this index separate from any
    # earlier one; without it every call targets the same default collection.
    collection_name = f"docs-{uuid.uuid4().hex}"
    return Chroma.from_documents(
        chunks,
        embedding_function,
        collection_name=collection_name,
    )
53
 
54
+ # إعداد واجهة الإجابة
55
# Question-answering state; both stay None until setup_qa() has built an index.
retriever = None
qa = None


def setup_qa(folder_path):
    """Build the retriever and the RetrievalQA chain from the files in *folder_path*.

    Loads the documents, indexes them, and wires the retriever and the Arabic
    answer prompt into a ``RetrievalQA`` chain around the module-level ``llm``.
    The module globals ``retriever`` and ``qa`` are replaced only after every
    step has succeeded, so a failed rebuild leaves the previous pair untouched.

    Args:
        folder_path: Directory containing the documents to index.

    Returns:
        None. The result is published through the ``retriever`` and ``qa`` globals.
    """
    global retriever, qa
    docs = load_documents_from_folder(folder_path)
    vectordb = create_vectorstore(docs)
    new_retriever = vectordb.as_retriever()

    # Instructs the model to answer in Arabic using only the retrieved documents.
    prompt_template = """
أنت مساعد ذكي تجيب باللغة العربية، تستند فقط إلى محتوى الوثائق المقدمة.
لا تقم بإضافة أي معلومات من عندك.

السؤال: {question}
=========
الوثائق:
{context}
=========
الإجابة المفصلة باللغة العربية:
"""

    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["question", "context"],
    )
    new_qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=new_retriever,
        chain_type_kwargs={"prompt": prompt},
    )

    # Swap both globals together so they always describe the same index.
    retriever, qa = new_retriever, new_qa
78
+
79
+ # تحميل الملفات من Gradio
80
def process_uploaded_files(files):
    """Copy the uploaded files into a scratch folder and rebuild the QA chain.

    Each upload may be a path string or an object whose ``name`` attribute is
    the path of the uploaded temp file.  The files are copied by base name,
    because joining an absolute upload path onto the scratch folder would
    resolve back to the upload itself and overwrite it.

    Args:
        files: Uploaded files from the Gradio file input; may be None or empty.

    Returns:
        None. With no files, nothing is rebuilt and any existing chain is kept.
    """
    if not files:
        return

    temp_dir = tempfile.mkdtemp()
    try:
        for index, file in enumerate(files):
            source_path = file if isinstance(file, (str, os.PathLike)) else file.name
            # The index prefix keeps same-named uploads from overwriting each other.
            dest_name = f"{index}_{os.path.basename(source_path)}"
            shutil.copyfile(source_path, os.path.join(temp_dir, dest_name))
        setup_qa(temp_dir)
    finally:
        # Remove the scratch copies even when indexing fails.
        shutil.rmtree(temp_dir, ignore_errors=True)
88
 
89
+ # الإجابة على الأسئلة
90
def answer_question(question):
    """Answer *question* with the QA chain, or ask the user to upload files first.

    Args:
        question: The user's question text.

    Returns:
        The chain's answer, or a fixed Arabic notice when no chain exists yet.
    """
    if qa is not None:
        return qa.run(question)
    return "الرجاء رفع ملفاتك أولًا."
95
 
96
+ # واجهة Gradio
97
def _index_uploads_with_status(files):
    """Index the uploaded *files* and return a status line to show in the UI."""
    if not files:
        return "الرجاء رفع ملفاتك أولًا."
    process_uploaded_files(files)
    return "تم تحميل الملفات وبناء قاعدة المعرفة بنجاح ✅"


# The CSS switches the whole container to a right-to-left layout.
with gr.Blocks(css=".gradio-container { direction: rtl; text-align: right; font-family: 'Cairo', sans-serif; }") as demo:
    gr.Markdown("## مساعد الوثائق الذكي")
    with gr.Row():
        file_input = gr.File(
            file_types=[".pdf", ".docx", ".txt"],
            file_count="multiple",
            label="ارفع ملفاتك",
        )
        load_button = gr.Button("ابدأ التحليل")
    # Read-only box reporting the outcome of indexing, so a finished build is visible.
    status_output = gr.Textbox(label="الحالة", interactive=False)
    question_input = gr.Textbox(label="اكتب سؤالك هنا")
    answer_output = gr.Textbox(label="الإجابة")

    load_button.click(
        fn=_index_uploads_with_status,
        inputs=[file_input],
        outputs=[status_output],
    )
    question_input.submit(
        fn=answer_question,
        inputs=[question_input],
        outputs=[answer_output],
    )

# Launch Gradio only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()