ramysaidagieb commited on
Commit
b13872f
·
verified ·
1 Parent(s): 4e3a79b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -61
app.py CHANGED
@@ -1,81 +1,64 @@
1
- from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
2
- from langchain.text_splitter import RecursiveCharacterTextSplitter
3
- from langchain.embeddings import SentenceTransformerEmbeddings
4
  from langchain.vectorstores import Chroma
 
 
5
  from langchain.chains import RetrievalQA
6
- from ctransformers import AutoModelForCausalLM
7
- import gradio as gr
8
- import os
9
- import tempfile
10
 
11
- # Load the model (CPU-only)
12
- llm = AutoModelForCausalLM.from_pretrained(
13
- model_path_or_repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
14
  model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
15
  model_type="mistral",
16
- config={
17
- 'max_new_tokens': 512,
18
- 'temperature': 0.5,
19
- 'gpu_layers': 0 # Disable GPU
20
- }
21
  )
22
 
23
- # Embedding model
24
- embedding_function = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
25
-
26
- # Temp folder for uploading documents
27
- persist_directory = tempfile.mkdtemp()
28
-
29
- def load_file(file):
30
- ext = os.path.splitext(file.name)[1].lower()
31
- if ext == ".pdf":
32
- loader = PyPDFLoader(file.name)
33
- elif ext == ".docx":
34
- loader = Docx2txtLoader(file.name)
35
- elif ext == ".txt":
36
- loader = TextLoader(file.name)
37
  else:
38
- return None
39
  return loader.load()
40
 
41
- def process_document(file):
42
- docs = load_file(file)
43
- if docs is None:
44
- return "صيغة غير مدعومة."
45
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
46
- texts = text_splitter.split_documents(docs)
47
- vectordb = Chroma.from_documents(texts, embedding_function, persist_directory=persist_directory)
48
- retriever = vectordb.as_retriever(search_kwargs={"k": 3})
49
- qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
50
  return qa_chain
51
 
52
  qa_chain = None
53
 
54
- def upload_file(file):
 
55
  global qa_chain
56
- qa_chain = process_document(file)
57
- return "تم رفع الملف ومعالجته بنجاح. يمكنك الآن طرح سؤالك."
58
-
59
- def answer_question(question):
60
  if qa_chain is None:
61
- return "يرجى رفع ملف أولاً."
62
- result = qa_chain({"query": question})
63
- answer = result["result"]
64
- sources = "\n\n".join([doc.page_content[:200] for doc in result["source_documents"]])
65
- return f"🧠 الإجابة:\n{answer}\n\n📚 المراجع:\n{sources}"
66
 
67
- with gr.Blocks() as demo:
68
- gr.Markdown("# 📄 Smart PDF Assistant\nنظام سؤال وجواب من ملفات PDF وورد ونصوص")
69
- with gr.Row():
70
- file_upload = gr.File(label="📂 ارفع مستند", type="file")
71
- upload_button = gr.Button("معالجة الملف")
72
- output = gr.Textbox(label="✅ الحالة")
73
- with gr.Row():
74
- question = gr.Textbox(label="✍️ اكتب سؤالك هنا")
75
- answer = gr.Button("📤 إرسال")
76
- response = gr.Textbox(label="🤖 الإجابة", lines=10)
77
 
78
- upload_button.click(fn=upload_file, inputs=file_upload, outputs=output)
79
- answer.click(fn=answer_question, inputs=question, outputs=response)
80
 
81
  demo.launch()
 
1
+ import os
2
+ import gradio as gr
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
  from langchain.vectorstores import Chroma
5
+ from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.chains import RetrievalQA
8
+ from langchain.llms import CTransformers
9
+
10
# Read the Hugging Face access token from the environment (HF Spaces "Secrets").
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the quantized Mistral model locally via the ctransformers backend
# (GGUF weights; runs on CPU with the Q4_K_M quantization).
llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    # NOTE(review): `hf_token` is not a documented parameter of LangChain's
    # CTransformers wrapper — verify it is accepted and not rejected as an
    # unknown field. The repo above is public, so a token is presumably
    # unnecessary here anyway; TODO confirm.
    hf_token=HF_TOKEN,
    config={"max_new_tokens": 512, "temperature": 0.7}
)
21
 
22
# Semantic loading of document text.
def load_documents(file_path):
    """Load the document at *file_path* and return it as LangChain documents.

    Supports ``.pdf``, ``.txt`` (read as UTF-8) and ``.docx`` files. The
    extension check is case-insensitive, so ``report.PDF`` also works.

    Raises:
        ValueError: if the file extension is not one of the supported types.
    """
    # Compare against the lower-cased suffix so .PDF/.TXT/.DOCX match too
    # (the original endswith() checks silently rejected upper-case names).
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext == ".txt":
        # Explicit encoding avoids locale-dependent decoding of Arabic text.
        loader = TextLoader(file_path, encoding='utf-8')
    elif ext == ".docx":
        loader = Docx2txtLoader(file_path)
    else:
        raise ValueError("Unsupported file type.")
    return loader.load()
33
 
34
# Prepare the uploaded document for question answering.
def process_file(file):
    """Build a RetrievalQA chain over the uploaded *file*.

    Splits the loaded document into overlapping chunks, embeds them with a
    multilingual sentence-transformer, indexes the chunks in an in-memory
    Chroma store, and wires its retriever to the module-level LLM.
    """
    documents = load_documents(file.name)
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    pieces = chunker.split_documents(documents)
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    store = Chroma.from_documents(pieces, embedder)
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=store.as_retriever())
    return qa_chain
44
 
45
# Lazily-built QA chain: created on the first question, then reused so later
# questions hit the same vector index without re-embedding the document.
qa_chain = None

# Gradio handler.
def ask_question(file, question):
    """Answer *question* from the uploaded *file*, as RTL-formatted HTML.

    On the first call the retrieval chain is built from *file* and cached in
    the module-level ``qa_chain``; subsequent calls reuse the cached chain.
    Returns a right-to-left ``<div>`` so Arabic answers render correctly.
    """
    global qa_chain
    # Guard: clicking "ask" before uploading a file used to crash on
    # file.name (AttributeError on None). Prompt the user instead.
    if qa_chain is None and file is None:
        return "<div dir='rtl' style='text-align: right;'>يرجى رفع ملف أولاً.</div>"
    if qa_chain is None:
        qa_chain = process_file(file)
    answer = qa_chain.run(question)
    return f"<div dir='rtl' style='text-align: right;'>{answer}</div>"
 
 
54
 
55
# Gradio UI: right-to-left page layout for an Arabic-speaking audience.
with gr.Blocks(css="body {direction: rtl; text-align: right;}") as demo:
    gr.Markdown("## مساعد الوثائق الذكي - استعلام باللغة العربية من ملفاتك")
    # Components are created in display order (Blocks lays them out top-down).
    doc_picker = gr.File(label="📄 حمّل ملفًا (PDF / DOCX / TXT)", file_types=[".pdf", ".txt", ".docx"])
    query_box = gr.Textbox(label=" أدخل سؤالك بالعربية", placeholder="ما هو موضوع هذا الملف؟")
    answer_html = gr.HTML()

    ask_button = gr.Button("🔍 استعلم")
    # Route the click through the lazy QA handler defined above.
    ask_button.click(fn=ask_question, inputs=[doc_picker, query_box], outputs=answer_html)

demo.launch()