# rag_system.py
import os
from typing import List, Dict

import chromadb
from dotenv import load_dotenv
from langchain_groq import ChatGroq

from pdf_loader import load_pdf
from optimal_chunker import OptimalChunker
from embeddings import embed_texts

load_dotenv()


class RAGPipeline:
    def __init__(self, persist_dir: str = "./chroma_db", collection_name: str = "pdf_docs"):
        # Vector DB (Chroma 1.x new API)
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.col = self.client.get_or_create_collection(name=collection_name)

        # Chunker
        self.chunker = OptimalChunker()

        # LLM (Groq)
        self.llm = ChatGroq(
            model="llama3-8b-8192",
            temperature=0.0,
            api_key=os.getenv("GROQ_API_KEY"),
        )

    # 1) Load  2) Chunk  3) Embed  4) Upsert to Chroma
    def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
        docs = load_pdf(pdf_path)
        text = "\n\n".join(d.page_content for d in docs)

        summary = self.chunker.fit_on_text(text)
        chunks = self.chunker.transform()

        # embeddings: list[list[float]]
        vectors = embed_texts(chunks)

        ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]
        self.col.add(
            ids=ids,
            documents=chunks,
            embeddings=vectors,
            metadatas=[{"source": pdf_path, "chunk": i} for i in range(len(chunks))],
        )
        return {"chunks_indexed": len(chunks), "best_strategy": summary}

    # 5) Retrieve  6) Ask LLM
    def query(self, question: str, k: int = 4) -> Dict:
        # Embed the query with the same embed_texts used at index time so the
        # query vector lives in the same embedding space as the stored chunks
        # (passing query_texts would fall back to Chroma's default embedding model).
        query_vec = embed_texts([question])
        results = self.col.query(query_embeddings=query_vec, n_results=k)
        chunks: List[str] = results["documents"][0] if results.get("documents") else []
        context = "\n\n".join(chunks)

        prompt = f"""You are an extraction assistant. Use ONLY the Context to answer.

Rules:
- If the answer is explicitly present in Context, return that substring EXACTLY.
- Do not paraphrase. Do not add words. Return a verbatim span from Context.
- If the answer is not in Context, reply exactly: I don't know

Question: {question}

Context:
{context}

Answer (verbatim from Context):"""

        resp = self.llm.invoke(prompt)
        answer = resp.content.strip()

        # Fallback if the model still hedges
        if (not answer or answer.lower().startswith("i don't know")) and context.strip():
            answer = chunks[0] if chunks else "I don't know"

        return {
            "answer": answer,
            "used_chunks": len(chunks),
            "context_preview": context[:500],
        }


if __name__ == "__main__":
    rag = RAGPipeline()
    info = rag.index_document("sample.pdf")  # ensure day3/sample.pdf exists
    print("Indexed:", info)

    out = rag.query("What text does the PDF contain?")
    print("Answer:", out["answer"])
    print("Used chunks:", out["used_chunks"])