Spaces:
Sleeping
Sleeping
Hamid Omarov
committed on
Commit
·
cd32be2
1
Parent(s):
3acec7e
Spaces: lazy LLM init + filepath upload
Browse files- day3/rag_system.py +26 -29
day3/rag_system.py
CHANGED
@@ -1,31 +1,41 @@
|
|
1 |
-
# rag_system.py
|
2 |
from typing import List, Dict
|
|
|
3 |
import chromadb
|
|
|
4 |
from pdf_loader import load_pdf
|
5 |
from optimal_chunker import OptimalChunker
|
6 |
from embeddings import embed_texts
|
|
|
|
|
7 |
from langchain_groq import ChatGroq
|
8 |
from dotenv import load_dotenv
|
9 |
-
import os
|
10 |
|
11 |
load_dotenv()
|
12 |
|
13 |
|
14 |
class RAGPipeline:
|
15 |
-
def __init__(self, persist_dir: str = "./
|
16 |
-
# Vector DB (Chroma 1.x
|
17 |
self.client = chromadb.PersistentClient(path=persist_dir)
|
18 |
self.col = self.client.get_or_create_collection(name=collection_name)
|
19 |
|
20 |
# Chunker
|
21 |
self.chunker = OptimalChunker()
|
22 |
|
23 |
-
# LLM (
|
24 |
-
self.llm =
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# 1) Load 2) Chunk 3) Embed 4) Upsert to Chroma
|
31 |
def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
|
@@ -35,8 +45,7 @@ class RAGPipeline:
|
|
35 |
summary = self.chunker.fit_on_text(text)
|
36 |
chunks = self.chunker.transform()
|
37 |
|
38 |
-
#
|
39 |
-
vectors = embed_texts(chunks)
|
40 |
ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]
|
41 |
|
42 |
self.col.add(
|
@@ -49,6 +58,9 @@ class RAGPipeline:
|
|
49 |
|
50 |
# 5) Retrieve 6) Ask LLM
|
51 |
def query(self, question: str, k: int = 4) -> Dict:
|
|
|
|
|
|
|
52 |
results = self.col.query(query_texts=[question], n_results=k)
|
53 |
chunks: List[str] = results["documents"][0] if results.get("documents") else []
|
54 |
|
@@ -66,24 +78,9 @@ Context:
|
|
66 |
|
67 |
Answer (verbatim from Context):"""
|
68 |
resp = self.llm.invoke(prompt)
|
69 |
-
answer = resp.content.strip()
|
70 |
|
71 |
-
# Fallback if the model still hedges
|
72 |
if (not answer or answer.lower().startswith("i don't know")) and context.strip():
|
73 |
answer = chunks[0] if chunks else "I don't know"
|
74 |
|
75 |
-
return {
|
76 |
-
"answer": answer,
|
77 |
-
"used_chunks": len(chunks),
|
78 |
-
"context_preview": context[:500],
|
79 |
-
}
|
80 |
-
|
81 |
-
|
82 |
-
if __name__ == "__main__":
|
83 |
-
rag = RAGPipeline()
|
84 |
-
info = rag.index_document("sample.pdf") # ensure day3/sample.pdf exists
|
85 |
-
print("Indexed:", info)
|
86 |
-
|
87 |
-
out = rag.query("What text does the PDF contain?")
|
88 |
-
print("Answer:", out["answer"])
|
89 |
-
print("Used chunks:", out["used_chunks"])
|
|
|
1 |
+
# day3/rag_system.py
|
2 |
from typing import List, Dict
|
3 |
+
import os
|
4 |
import chromadb
|
5 |
+
|
6 |
from pdf_loader import load_pdf
|
7 |
from optimal_chunker import OptimalChunker
|
8 |
from embeddings import embed_texts
|
9 |
+
|
10 |
+
# LLM-i sonradan yaradacağıq
|
11 |
from langchain_groq import ChatGroq
|
12 |
from dotenv import load_dotenv
|
|
|
13 |
|
14 |
load_dotenv()
|
15 |
|
16 |
|
17 |
class RAGPipeline:
|
18 |
+
def __init__(self, persist_dir: str = "./chroma_db_space", collection_name: str = "pdf_docs"):
    """Set up the vector store and chunker; defer LLM creation.

    Args:
        persist_dir: On-disk directory for the Chroma persistent client.
        collection_name: Name of the Chroma collection that stores chunks.
    """
    # Vector DB (Chroma 1.x persistent client)
    self.client = chromadb.PersistentClient(path=persist_dir)
    self.col = self.client.get_or_create_collection(name=collection_name)

    # Chunker
    self.chunker = OptimalChunker()

    # LLM is intentionally NOT created here (lazy init) — it is built on
    # first use by _ensure_llm(), so the app can start without GROQ_API_KEY.
    self.llm = None
|
28 |
+
|
29 |
+
def _ensure_llm(self):
|
30 |
+
"""GROQ_API_KEY varsa LLM-i gec (ilk sorğuda) yarat."""
|
31 |
+
if self.llm is None:
|
32 |
+
api_key = os.getenv("GROQ_API_KEY")
|
33 |
+
if not api_key:
|
34 |
+
# LLM olmadan da app işə düşsün deyə aydın mesaj veririk
|
35 |
+
raise RuntimeError(
|
36 |
+
"GROQ_API_KEY tapılmadı. Space Settings → Variables and secrets bölməsində əlavə edin."
|
37 |
+
)
|
38 |
+
self.llm = ChatGroq(model="llama3-8b-8192", temperature=0.0, api_key=api_key)
|
39 |
|
40 |
# 1) Load 2) Chunk 3) Embed 4) Upsert to Chroma
|
41 |
def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
|
|
|
45 |
summary = self.chunker.fit_on_text(text)
|
46 |
chunks = self.chunker.transform()
|
47 |
|
48 |
+
vectors = embed_texts(chunks) # list[list[float]]
|
|
|
49 |
ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]
|
50 |
|
51 |
self.col.add(
|
|
|
58 |
|
59 |
# 5) Retrieve 6) Ask LLM
|
60 |
def query(self, question: str, k: int = 4) -> Dict:
|
61 |
+
# LLM-i bu zaman yaradacağıq (secret yoxdursa burda aydın xəta görünəcək)
|
62 |
+
self._ensure_llm()
|
63 |
+
|
64 |
results = self.col.query(query_texts=[question], n_results=k)
|
65 |
chunks: List[str] = results["documents"][0] if results.get("documents") else []
|
66 |
|
|
|
78 |
|
79 |
Answer (verbatim from Context):"""
|
80 |
resp = self.llm.invoke(prompt)
|
81 |
+
answer = resp.content.strip() if hasattr(resp, "content") else str(resp)
|
82 |
|
|
|
83 |
if (not answer or answer.lower().startswith("i don't know")) and context.strip():
|
84 |
answer = chunks[0] if chunks else "I don't know"
|
85 |
|
86 |
+
return {"answer": answer, "used_chunks": len(chunks), "context_preview": context[:500]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|