ramysaidagieb committed
Commit aacac9b · verified · Parent: eb1a7ba

Update rag_pipeline.py

Files changed (1)
  1. rag_pipeline.py +21 -6
rag_pipeline.py CHANGED
@@ -2,25 +2,39 @@ from pathlib import Path
 from langchain.chains import RetrievalQA
 from transformers import pipeline, AutoTokenizer
 from langchain_community.vectorstores import Chroma
-from langchain_community.document_loaders import DirectoryLoader
+from langchain_community.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
 
+def load_documents(pdf_dir):
+    docs = []
+    for pdf_file in Path(pdf_dir).glob("*.pdf"):
+        loader = PyMuPDFLoader(str(pdf_file))
+        docs.extend(loader.load())
+    return docs
+
 def load_rag_chain():
+    # Ensure directory exists
     pdf_dir = Path("data")
-    loader = DirectoryLoader(str(pdf_dir), glob="*.pdf")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    pages = loader.load_and_split(text_splitter=text_splitter)
+    pdf_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load and split PDFs
+    raw_docs = load_documents(pdf_dir)
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    pages = splitter.split_documents(raw_docs)
 
+    # Embedding model
     embeddings = HuggingFaceEmbeddings(
         model_name="sentence-transformers/all-MiniLM-L6-v2",
         model_kwargs={"device": "cpu"},
     )
 
+    # Vector database
     vectordb_dir = "chroma_db"
     vectordb = Chroma.from_documents(pages, embeddings, persist_directory=vectordb_dir)
     retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 5})
 
+    # LLM pipeline using free model
     hf_pipeline = pipeline(
         "text-generation",
         model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -28,9 +42,10 @@ def load_rag_chain():
         max_new_tokens=512,
         temperature=0.3,
         return_full_text=True,
-        device=-1,
+        device=-1,  # CPU
     )
     llm = HuggingFacePipeline(pipeline=hf_pipeline)
 
+    # QA Chain
     qa_chain = RetrievalQA.from_llm(llm=llm, retriever=retriever)
     return qa_chain
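
For context, a minimal usage sketch of the updated module. The driver script, the question text, and the assumption that data/ already holds at least one PDF are illustrative; only load_rag_chain() comes from the commit itself.

# Hypothetical driver script, not part of this commit.
# Assumes rag_pipeline.py (as updated above) is importable and that
# data/ contains at least one PDF before the chain is built.
from rag_pipeline import load_rag_chain

qa_chain = load_rag_chain()

# RetrievalQA chains take their input under the "query" key and return
# the answer under "result"; the question here is just an example.
result = qa_chain.invoke({"query": "What topics do the indexed PDFs cover?"})
print(result["result"])

One caveat: load_rag_chain() calls Chroma.from_documents on every invocation, so the PDFs are re-embedded on each run. For repeated runs, reloading the persisted chroma_db directory instead (e.g. Chroma(persist_directory="chroma_db", embedding_function=embeddings)) would avoid that, though the commit does not do this.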