ramysaidagieb committed on
Commit
75ebc6b
·
verified ·
1 Parent(s): c78cfc1

Update rag_pipeline.py

Browse files
Files changed (1) hide show
  1. rag_pipeline.py +10 -10
rag_pipeline.py CHANGED
@@ -6,6 +6,7 @@ from langchain_community.document_loaders import PyMuPDFLoader
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
8
 
 
9
  def load_documents(pdf_dir):
10
  docs = []
11
  for pdf_file in Path(pdf_dir).glob("*.pdf"):
@@ -14,7 +15,7 @@ def load_documents(pdf_dir):
14
  return docs
15
 
16
  def load_rag_chain():
17
- # Ensure directory exists
18
  pdf_dir = Path("data")
19
  pdf_dir.mkdir(parents=True, exist_ok=True)
20
 
@@ -23,29 +24,28 @@ def load_rag_chain():
23
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
24
  pages = splitter.split_documents(raw_docs)
25
 
26
- # Embedding model
27
  embeddings = HuggingFaceEmbeddings(
28
  model_name="sentence-transformers/all-MiniLM-L6-v2",
29
  model_kwargs={"device": "cpu"},
30
  )
31
 
32
- # Vector database
33
  vectordb_dir = "chroma_db"
34
  vectordb = Chroma.from_documents(pages, embeddings, persist_directory=vectordb_dir)
35
  retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 5})
36
 
37
- # LLM pipeline using free model
38
  hf_pipeline = pipeline(
39
- "text-generation",
40
- model="mistralai/Mistral-7B-Instruct-v0.2",
41
- tokenizer=AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2"),
42
  max_new_tokens=512,
43
  temperature=0.3,
44
- return_full_text=True,
45
- device=-1 # CPU
46
  )
47
  llm = HuggingFacePipeline(pipeline=hf_pipeline)
48
 
49
- # QA Chain
50
  qa_chain = RetrievalQA.from_llm(llm=llm, retriever=retriever)
51
  return qa_chain
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
8
 
9
+ # Load all PDFs from the data folder
10
  def load_documents(pdf_dir):
11
  docs = []
12
  for pdf_file in Path(pdf_dir).glob("*.pdf"):
 
15
  return docs
16
 
17
  def load_rag_chain():
18
+ # Make sure the data directory exists
19
  pdf_dir = Path("data")
20
  pdf_dir.mkdir(parents=True, exist_ok=True)
21
 
 
24
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
25
  pages = splitter.split_documents(raw_docs)
26
 
27
+ # Load sentence transformer for embeddings
28
  embeddings = HuggingFaceEmbeddings(
29
  model_name="sentence-transformers/all-MiniLM-L6-v2",
30
  model_kwargs={"device": "cpu"},
31
  )
32
 
33
+ # Vector store
34
  vectordb_dir = "chroma_db"
35
  vectordb = Chroma.from_documents(pages, embeddings, persist_directory=vectordb_dir)
36
  retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 5})
37
 
38
+ # Load a completely free and CPU-compatible model
39
  hf_pipeline = pipeline(
40
+ "text2text-generation",
41
+ model="google/flan-t5-base",
42
+ tokenizer=AutoTokenizer.from_pretrained("google/flan-t5-base"),
43
  max_new_tokens=512,
44
  temperature=0.3,
45
+ device=-1 # -1 means CPU
 
46
  )
47
  llm = HuggingFacePipeline(pipeline=hf_pipeline)
48
 
49
+ # Build RetrievalQA chain
50
  qa_chain = RetrievalQA.from_llm(llm=llm, retriever=retriever)
51
  return qa_chain