ramysaidagieb committed on
Commit
6750035
·
verified ·
1 Parent(s): 8237e9a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +19 -0
  2. rag_pipeline.py +44 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from rag_pipeline import load_rag_chain
3
+
4
+ rag_chain = load_rag_chain()  # Built once at import time; ask_question() reuses this chain for every query.
5
+
6
def ask_question(query):
    """Answer a user question with the module-level RAG chain.

    Args:
        query: Question text submitted through the Gradio textbox.

    Returns:
        The value stored under the ``"result"`` key of the chain's output.
    """
    return rag_chain.invoke(query)["result"]
9
+
10
# The UI strings are Arabic, so both text boxes are rendered right-to-left.
_question_box = gr.Textbox(
    lines=3,
    placeholder="اكتب سؤالك هنا...",
    label="سؤالك",
    rtl=True,
)
_answer_box = gr.Textbox(label="الإجابة", rtl=True)

# Single-input / single-output form wired to ask_question().
iface = gr.Interface(
    fn=ask_question,
    inputs=_question_box,
    outputs=_answer_box,
    title="🧠 روبوت دردشة عربي باستخدام ملفات PDF",
    description="اكتب سؤالك بالعربية (يدعم اللهجة المصرية)، وسنبحث عن الإجابة داخل الملفات التي قمت بتحميلها.",
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
rag_pipeline.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from langchain.chains import RetrievalQA
3
+ from transformers import pipeline, AutoTokenizer
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain_community.document_loaders import PyMuPDFLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
8
+
9
def load_documents(pdf_dir):
    """Load every PDF located directly inside *pdf_dir*.

    Files are visited in sorted order so the resulting document list is
    reproducible across runs and platforms. The ``.pdf`` extension is matched
    case-insensitively (``REPORT.PDF`` is picked up too), and only regular
    files are considered, so a sub-directory whose name happens to end in
    ``.pdf`` is never handed to the PDF loader.

    Args:
        pdf_dir: Folder to scan, as a ``str`` or ``pathlib.Path``.
            Sub-folders are not searched.

    Returns:
        A list with the documents produced by ``PyMuPDFLoader.load()`` for
        each PDF, concatenated in file order. The list is empty when the
        folder does not exist or contains no PDF files.
    """
    root = Path(pdf_dir)
    if not root.is_dir():
        # Same outcome as globbing a missing folder: nothing to load.
        return []

    pdf_files = sorted(
        entry
        for entry in root.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )

    docs = []
    for pdf_file in pdf_files:
        docs.extend(PyMuPDFLoader(str(pdf_file)).load())
    return docs
15
+
16
def _stable_chunk_ids(chunks):
    """Return one deterministic id per chunk, unique within the batch."""
    # Local imports keep this block self-contained without touching the
    # file-level import section.
    import hashlib
    from collections import Counter

    occurrences = Counter()
    ids = []
    for chunk in chunks:
        meta = chunk.metadata
        key = "\x1f".join(
            (
                str(meta.get("source", "")),
                str(meta.get("page", "")),
                chunk.page_content,
            )
        )
        digest = hashlib.sha256(key.encode("utf-8", errors="replace")).hexdigest()
        # The same text can repeat on one page (headers, footers); the ordinal
        # keeps ids distinct inside a single insert batch.
        ids.append(f"{digest}-{occurrences[digest]}")
        occurrences[digest] += 1
    return ids


def load_rag_chain(pdf_dir="data", vectordb_dir="chroma_db"):
    """Build the retrieval-QA chain over the PDFs found in *pdf_dir*.

    Steps: load the PDFs, split them into overlapping chunks, embed the
    chunks with LaBSE into a Chroma store persisted at *vectordb_dir*, and
    wrap an mT5 question-answering pipeline plus an MMR retriever in a
    ``RetrievalQA`` chain.

    Args:
        pdf_dir: Folder holding the source PDFs. Created if missing.
            Defaults to ``"data"``.
        vectordb_dir: Folder where Chroma persists the index. Defaults to
            ``"chroma_db"``.

    Returns:
        The ``RetrievalQA`` chain created by ``RetrievalQA.from_llm``.

    Raises:
        ValueError: If no text chunks could be extracted from *pdf_dir*
            (for example, the folder is empty).
    """
    pdf_dir = Path(pdf_dir)
    pdf_dir.mkdir(parents=True, exist_ok=True)

    raw_docs = load_documents(pdf_dir)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pages = splitter.split_documents(raw_docs)
    if not pages:
        # Fail before the models are loaded: an empty corpus cannot be
        # indexed, and the error raised deep inside the vector store does not
        # tell the user what is actually wrong.
        raise ValueError(
            f"No PDF text found in '{pdf_dir}'. Add at least one readable PDF "
            "file before starting the application."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )

    # Deterministic ids make re-indexing on restart overwrite the chunks that
    # are already persisted instead of appending duplicates of them.
    # NOTE(review): chunks of PDFs that were deleted from pdf_dir stay in the
    # persisted collection; clear vectordb_dir to rebuild from scratch.
    vectordb = Chroma.from_documents(
        pages,
        embeddings,
        ids=_stable_chunk_ids(pages),
        persist_directory=str(vectordb_dir),
    )
    retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 5})

    qa_model = "csebuetnlp/mT5_small_finetuned_squad"
    hf_pipeline = pipeline(
        "text2text-generation",
        model=qa_model,
        tokenizer=AutoTokenizer.from_pretrained(qa_model),
        max_new_tokens=512,
        # NOTE(review): temperature presumably only takes effect when sampling
        # is enabled (do_sample=True) — confirm the intended decoding mode.
        temperature=0.3,
        device=-1,  # CPU, consistent with the embedding model above
    )
    llm = HuggingFacePipeline(pipeline=hf_pipeline)

    return RetrievalQA.from_llm(llm=llm, retriever=retriever)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ sentence-transformers
4
+ langchain
5
+ langchain-community
6
+ langchain-huggingface
7
+ chromadb
8
+ pymupdf