# day3/rag_system.py
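"""Minimal RAG pipeline: load a PDF, chunk it, embed the chunks into Chroma,
then answer questions by retrieving the top-k chunks and prompting a Groq LLM.

Steps: 1) Load  2) Chunk  3) Embed  4) Upsert to Chroma  5) Retrieve  6) Ask LLM
"""
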
import os
from typing import Dict, List

import chromadb
from dotenv import load_dotenv
from langchain_groq import ChatGroq

from embeddings import embed_texts
from optimal_chunker import OptimalChunker
from pdf_loader import load_pdf

# The LLM itself is created lazily, on first use (see _ensure_llm below).

load_dotenv()


class RAGPipeline:
    def __init__(self, persist_dir: str = "./chroma_db_space", collection_name: str = "pdf_docs"):
        # Vector DB (Chroma 1.x)
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.col = self.client.get_or_create_collection(name=collection_name)
        # Chunker
        self.chunker = OptimalChunker()
        # The LLM is NOT created here (lazy init), so the app can boot without the API key
        self.llm = None

    def _ensure_llm(self):
        """Create the LLM lazily (on the first query) if GROQ_API_KEY is set."""
        if self.llm is None:
            api_key = os.getenv("GROQ_API_KEY")
            if not api_key:
                # Raise a clear message so the app can still start without the LLM
                raise RuntimeError(
                    "GROQ_API_KEY not found. Add it under Space Settings → Variables and secrets."
                )
            self.llm = ChatGroq(model="llama3-8b-8192", temperature=0.0, api_key=api_key)

    # 1) Load  2) Chunk  3) Embed  4) Upsert to Chroma
    def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
        docs = load_pdf(pdf_path)
        text = "\n\n".join(d.page_content for d in docs)
        summary = self.chunker.fit_on_text(text)
        chunks = self.chunker.transform()
        vectors = embed_texts(chunks)  # list[list[float]]
        ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]
        # upsert (not add), so re-indexing the same document does not raise on duplicate IDs
        self.col.upsert(
            ids=ids,
            documents=chunks,
            embeddings=vectors,
            metadatas=[{"source": pdf_path, "chunk": i} for i in range(len(chunks))],
        )
        return {"chunks_indexed": len(chunks), "best_strategy": summary}

    # 5) Retrieve  6) Ask LLM
    def query(self, question: str, k: int = 4) -> Dict:
        # Create the LLM now; if the secret is missing, a clear error surfaces at query time
        self._ensure_llm()
        # Embed the question with the same embedder used at index time, so query and
        # document vectors share one space (query_texts would use Chroma's default embedder)
        results = self.col.query(query_embeddings=embed_texts([question]), n_results=k)
        chunks: List[str] = results["documents"][0] if results.get("documents") else []
        context = "\n\n".join(chunks)
        prompt = f"""You are an extraction assistant. Use ONLY the Context to answer.
Rules:
- If the answer is explicitly present in Context, return that substring EXACTLY.
- Do not paraphrase. Do not add words. Return a verbatim span from Context.
- If the answer is not in Context, reply exactly: I don't know

Question: {question}

Context:
{context}

Answer (verbatim from Context):"""
        resp = self.llm.invoke(prompt)
        answer = resp.content.strip() if hasattr(resp, "content") else str(resp)
        # Crude fallback: if the model refuses but we do have retrieved context,
        # surface the top-ranked chunk instead of an empty answer
        if (not answer or answer.lower().startswith("i don't know")) and context.strip():
            answer = chunks[0] if chunks else "I don't know"
        return {"answer": answer, "used_chunks": len(chunks), "context_preview": context[:500]}