ramysaidagieb committed on
Commit
e1fffd4
·
verified ·
1 Parent(s): 4ce9f60

Delete rag_pipeline.py

Browse files
Files changed (1) hide show
  1. rag_pipeline.py +0 -45
rag_pipeline.py DELETED
@@ -1,45 +0,0 @@
1
- from pathlib import Path
2
- from langchain.chains import RetrievalQA
3
- from transformers import pipeline, T5Tokenizer
4
- from langchain_community.vectorstores import Chroma
5
- from langchain_community.document_loaders import PyMuPDFLoader
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
- from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
8
-
9
def load_documents(pdf_dir):
    """Load every PDF directly inside *pdf_dir* into one flat list.

    Args:
        pdf_dir: Directory (string or path-like) searched, non-recursively,
            for entries matching ``*.pdf``.

    Returns:
        The items returned by ``PyMuPDFLoader(path).load()`` for each PDF,
        concatenated in sorted path order. Empty when no PDF file is found.
    """
    docs = []
    # glob() order depends on the filesystem; sorting keeps ingestion order
    # (and anything derived from it) reproducible across runs and machines.
    for pdf_file in sorted(Path(pdf_dir).glob("*.pdf")):
        # The pattern also matches directories named like "x.pdf"; those are
        # not loadable documents, so skip them instead of failing the run.
        if not pdf_file.is_file():
            continue
        loader = PyMuPDFLoader(str(pdf_file))
        docs.extend(loader.load())
    return docs
15
-
16
def load_rag_chain():
    """Build a RetrievalQA chain over the PDFs found in the ``data`` directory.

    Steps performed, in order:
      1. Ensure ``data/`` exists and load its PDFs via ``load_documents``.
      2. Split them into 1000-character chunks with 200 characters of overlap.
      3. Embed the chunks with ``sentence-transformers/LaBSE`` on CPU and store
         them in a Chroma collection persisted under ``chroma_db``.
      4. Wrap an ``ArabicNLP/mT5-base_ar`` text2text pipeline (CPU) as the LLM.

    Returns:
        The object returned by ``RetrievalQA.from_llm`` wired to an MMR
        retriever that returns 5 chunks per query.
    """
    import hashlib  # local import keeps this block self-contained

    pdf_dir = Path("data")
    pdf_dir.mkdir(parents=True, exist_ok=True)

    raw_docs = load_documents(pdf_dir)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pages = splitter.split_documents(raw_docs)

    # Give every chunk a deterministic id derived from its origin and text.
    # Without explicit ids each call stores the same chunks again under fresh
    # random ids, so the persisted collection grows with duplicates on every
    # restart. Stable ids make re-indexing overwrite existing entries instead.
    # Chunks that are identical in source, page and text collapse to one entry.
    unique_pages = {}
    for page in pages:
        fingerprint = "\x1f".join(
            (
                str(page.metadata.get("source", "")),
                str(page.metadata.get("page", "")),
                page.page_content,
            )
        )
        chunk_id = hashlib.sha256(
            fingerprint.encode("utf-8", errors="replace")
        ).hexdigest()
        unique_pages.setdefault(chunk_id, page)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )

    vectordb_dir = "chroma_db"
    vectordb = Chroma.from_documents(
        list(unique_pages.values()),
        embeddings,
        # With no chunks, fall back to the library default (ids=None).
        ids=list(unique_pages) or None,
        persist_directory=vectordb_dir,
    )
    # NOTE(review): chunks from PDFs that were removed from data/ stay in the
    # persisted collection; delete chroma_db to rebuild from scratch.
    retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 5})

    # Use the slow (SentencePiece) tokenizer explicitly.
    model_name = "ArabicNLP/mT5-base_ar"
    tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)
    hf_pipeline = pipeline(
        "text2text-generation",
        model=model_name,
        tokenizer=tokenizer,
        max_new_tokens=512,
        # NOTE(review): temperature only affects sampling-based generation;
        # without do_sample=True it is presumably ignored -- confirm intent.
        temperature=0.3,
        device=-1,  # -1 selects CPU
    )
    llm = HuggingFacePipeline(pipeline=hf_pipeline)

    return RetrievalQA.from_llm(llm=llm, retriever=retriever)