Commit e7e9247 by Hamid Omarov
Parent(s): b834d82

HF Space app + minimal pipeline code (no secrets)
Files changed:
- day3/chunking_test.py +83 -0
- day3/embeddings.py +12 -0
- day3/optimal_chunker.py +113 -0
- day3/pdf_loader.py +13 -0
- day3/rag_system.py +89 -0
- day3/vector_store.py +15 -0
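None of the files in this commit is the Space entry point itself. As a sketch of how these modules might be wired into a Gradio UI for the Space (the file name app.py, the Gradio layout, and all labels below are assumptions, not part of this diff):

# app.py — illustrative sketch only; the actual Space app is not in this diff.
# Assumes it sits next to the day3 modules so their flat imports resolve.
import gradio as gr
from rag_system import RAGPipeline

rag = RAGPipeline()

def ask(pdf_path, question):
    # Index the uploaded PDF, then answer from the retrieved chunks
    rag.index_document(pdf_path)
    return rag.query(question)["answer"]

demo = gr.Interface(
    fn=ask,
    inputs=[gr.File(label="PDF", type="filepath"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Answer"),
)

if __name__ == "__main__":
    demo.launch()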
day3/chunking_test.py
ADDED
@@ -0,0 +1,83 @@
# chunking_test.py
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)
from pdf_loader import load_pdf

# GPT/Copilot: "utility to flatten pages into a single string"
def docs_to_text(docs):
    return "\n\n".join([d.page_content for d in docs])

# GPT/Copilot: "run a splitter on text and return list[str]"
def split_text(text, splitter):
    return splitter.split_text(text)

# GPT/Copilot: "compute metrics: chunk count, average size (chars or tokens), and overlap setting"
def compute_metrics(chunks, unit="chars", chunk_size=None, chunk_overlap=None):
    sizes = [len(c) for c in chunks]
    avg = sum(sizes) / len(sizes) if sizes else 0
    if unit == "chars":
        return {
            "chunks": len(chunks),
            "avg_chars": round(avg, 1),
            "overlap": chunk_overlap,
        }
    else:
        # Token mode: chunks are still plain strings here, so we report
        # their character length rather than re-tokenizing.
        return {
            "chunks": len(chunks),
            "avg_len_str": round(avg, 1),
            "overlap": chunk_overlap,
        }

def run_comparison(pdf_path="sample.pdf"):
    docs = load_pdf(pdf_path)
    text = docs_to_text(docs)

    # 1) Fixed size (CharacterTextSplitter)
    fixed = CharacterTextSplitter(
        chunk_size=800, chunk_overlap=100, separator="\n"
    )
    fixed_chunks = split_text(text, fixed)
    fixed_metrics = compute_metrics(
        fixed_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 2) Recursive (RecursiveCharacterTextSplitter)
    recursive = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    recursive_chunks = split_text(text, recursive)
    recursive_metrics = compute_metrics(
        recursive_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 3) Token-based (TokenTextSplitter)
    token = TokenTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
    )
    token_chunks = split_text(text, token)
    token_metrics = compute_metrics(
        token_chunks, unit="tokens", chunk_size=512, chunk_overlap=64
    )

    print("=== Chunking Comparison ===")
    print("Fixed (chars):    ", fixed_metrics)
    print("Recursive (chars):", recursive_metrics)
    print("Token-based:      ", token_metrics)

    # Optional: show first chunk samples for sanity
    print("\n--- Sample Chunks ---")
    for name, chunks in [("Fixed", fixed_chunks), ("Recursive", recursive_chunks), ("Token", token_chunks)]:
        if not chunks:
            print(f"{name}: no chunks produced")
            continue
        preview = chunks[0][:200].replace("\n", " ") + ("..." if len(chunks[0]) > 200 else "")
        print(f"{name} #1 →", preview)

if __name__ == "__main__":
    run_comparison("sample.pdf")
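The three splitters differ mainly in where they are allowed to cut; chunk_overlap is what keeps boundary text intact across cuts. A toy sketch of the overlap behavior (not part of this commit; the text and sizes are arbitrary):

# overlap_demo.py — illustrative only
from langchain.text_splitter import RecursiveCharacterTextSplitter

text = "one two three four five six seven eight nine ten " * 5
splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10, separators=[" ", ""])
chunks = splitter.split_text(text)
# Consecutive chunks share up to ~10 characters, so a span cut at a
# boundary still appears whole in at least one chunk.
print(len(chunks))
print(chunks[0], "||", chunks[1])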
day3/embeddings.py
ADDED
@@ -0,0 +1,12 @@
from typing import List, Dict
from sentence_transformers import SentenceTransformer

_embedder = SentenceTransformer("all-MiniLM-L6-v2")

def embed_texts(texts: List[str]) -> List[List[float]]:
    # Return as Python lists of floats (Chroma-compatible)
    return _embedder.encode(texts, convert_to_numpy=True).tolist()

def create_embeddings(chunks: List[str]) -> Dict:
    vectors = embed_texts(chunks)
    return {"embeddings": vectors, "count": len(vectors)}
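A quick smoke test for the module above (illustrative; all-MiniLM-L6-v2 produces 384-dimensional vectors, which is the dimensionality the Chroma collections elsewhere in this commit end up storing):

# embeddings_demo.py — illustrative only
from embeddings import create_embeddings

out = create_embeddings(["hello world", "chunking strategies"])
print(out["count"])               # 2
print(len(out["embeddings"][0]))  # 384 for all-MiniLM-L6-v2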
day3/optimal_chunker.py
ADDED
@@ -0,0 +1,113 @@
# optimal_chunker.py
from typing import Dict, List, Tuple
from statistics import mean
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)
from pdf_loader import load_pdf

# --- Helpers ---
def docs_to_text(docs) -> str:
    return "\n\n".join([d.page_content for d in docs])

def run_splitter(text: str, splitter) -> List[str]:
    return splitter.split_text(text)

def metrics(chunks: List[str]) -> Dict:
    if not chunks:
        return {"chunks": 0, "avg_len": 0, "max_len": 0}
    lens = [len(c) for c in chunks]
    return {
        "chunks": len(chunks),
        "avg_len": round(mean(lens), 1),
        "max_len": max(lens),
    }

# --- Strategy evaluation ---
def evaluate_strategies(
    text: str,
    char_size: int = 800,
    char_overlap: int = 100,
    token_size: int = 512,
    token_overlap: int = 64,
) -> Dict[str, Dict]:
    fixed = CharacterTextSplitter(chunk_size=char_size, chunk_overlap=char_overlap, separator="\n")
    recursive = RecursiveCharacterTextSplitter(
        chunk_size=char_size, chunk_overlap=char_overlap, separators=["\n\n", "\n", " ", ""]
    )
    token = TokenTextSplitter(chunk_size=token_size, chunk_overlap=token_overlap)

    fixed_chunks = run_splitter(text, fixed)
    rec_chunks = run_splitter(text, recursive)
    tok_chunks = run_splitter(text, token)

    return {
        "fixed": {"chunks": fixed_chunks, "metrics": metrics(fixed_chunks), "meta": {"size": char_size, "overlap": char_overlap, "unit": "chars"}},
        "recursive": {"chunks": rec_chunks, "metrics": metrics(rec_chunks), "meta": {"size": char_size, "overlap": char_overlap, "unit": "chars"}},
        "token": {"chunks": tok_chunks, "metrics": metrics(tok_chunks), "meta": {"size": token_size, "overlap": token_overlap, "unit": "tokens"}},
    }

def score(candidate: Dict, target_avg: int = 800, hard_max: int = 1500) -> float:
    """Lower is better: distance to target + penalty if max chunk too large."""
    m = candidate["metrics"]
    dist = abs(m["avg_len"] - target_avg)
    penalty = 0 if m["max_len"] <= hard_max else (m["max_len"] - hard_max)
    # Favor more, smaller chunks over one giant chunk
    few_chunk_penalty = 500 if m["chunks"] <= 1 else 0
    return dist + penalty + few_chunk_penalty

def select_best(evals: Dict[str, Dict], target_avg: int = 800, hard_max: int = 1500) -> Tuple[str, Dict]:
    scored = [(name, score(info, target_avg, hard_max)) for name, info in evals.items()]
    scored.sort(key=lambda x: x[1])
    return scored[0][0], evals[scored[0][0]]

# --- Final pipeline API ---
class OptimalChunker:
    def __init__(
        self,
        char_size: int = 800,
        char_overlap: int = 100,
        token_size: int = 512,
        token_overlap: int = 64,
        target_avg: int = 800,
        hard_max: int = 1500,
    ):
        self.char_size = char_size
        self.char_overlap = char_overlap
        self.token_size = token_size
        self.token_overlap = token_overlap
        self.target_avg = target_avg
        self.hard_max = hard_max
        self.best_name = None
        self.best_info = None

    def fit_on_text(self, text: str) -> Dict:
        evals = evaluate_strategies(
            text,
            char_size=self.char_size,
            char_overlap=self.char_overlap,
            token_size=self.token_size,
            token_overlap=self.token_overlap,
        )
        self.best_name, self.best_info = select_best(evals, self.target_avg, self.hard_max)
        return {"best": self.best_name, "metrics": self.best_info["metrics"], "meta": self.best_info["meta"]}

    def transform(self) -> List[str]:
        assert self.best_info is not None, "Call fit_on_text first."
        return self.best_info["chunks"]

    def fit_transform_pdf(self, pdf_path: str) -> Tuple[str, List[str], Dict]:
        docs = load_pdf(pdf_path)
        text = docs_to_text(docs)
        summary = self.fit_on_text(text)
        return self.best_name, self.transform(), summary

if __name__ == "__main__":
    # Demo on sample.pdf
    ch = OptimalChunker()
    best, chunks, summary = ch.fit_transform_pdf("sample.pdf")
    print("=== Best Strategy ===")
    print(best, summary)
    print(f"First chunk preview:\n{chunks[0][:300] if chunks else ''}")
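The scoring rule is easiest to read with numbers plugged in. A toy check against the score() defined above (the metric values are made up):

# score_demo.py — illustrative only
from optimal_chunker import score

many_small = {"metrics": {"chunks": 10, "avg_len": 750.0, "max_len": 1200}}
one_giant = {"metrics": {"chunks": 1, "avg_len": 800.0, "max_len": 9000}}

print(score(many_small))  # |750 - 800| = 50.0, no penalties
print(score(one_giant))   # 0 + (9000 - 1500) + 500 single-chunk penalty = 8000.0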
day3/pdf_loader.py
ADDED
@@ -0,0 +1,13 @@
from langchain_community.document_loaders import PyPDFLoader

def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    return pages

if __name__ == "__main__":
    docs = load_pdf("sample.pdf")
    print(f"✅ Loaded {len(docs)} pages")
    for i, page in enumerate(docs, start=1):
        print(f"--- Page {i} ---")
        print(page.page_content)
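PyPDFLoader needs the pypdf package at runtime, and a Space running these modules has to declare its dependencies. A plausible requirements.txt inferred from the imports in this diff (the exact package set and the missing version pins are assumptions; gradio is only needed for the assumed app.py sketched earlier):

# requirements.txt — illustrative; inferred from imports, pins omitted
langchain
langchain-community
langchain-groq
sentence-transformers
chromadb
pypdf
python-dotenv
gradio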
day3/rag_system.py
ADDED
@@ -0,0 +1,89 @@
# rag_system.py
from typing import List, Dict
import chromadb
from pdf_loader import load_pdf
from optimal_chunker import OptimalChunker
from embeddings import embed_texts
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os

load_dotenv()


class RAGPipeline:
    def __init__(self, persist_dir: str = "./chroma_db", collection_name: str = "pdf_docs"):
        # Vector DB (Chroma 1.x new API)
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.col = self.client.get_or_create_collection(name=collection_name)

        # Chunker
        self.chunker = OptimalChunker()

        # LLM (Groq)
        self.llm = ChatGroq(
            model="llama3-8b-8192",
            temperature=0.0,
            api_key=os.getenv("GROQ_API_KEY"),
        )

    # 1) Load  2) Chunk  3) Embed  4) Upsert to Chroma
    def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
        docs = load_pdf(pdf_path)
        text = "\n\n".join(d.page_content for d in docs)

        summary = self.chunker.fit_on_text(text)
        chunks = self.chunker.transform()

        # embeddings: list[list[float]]
        vectors = embed_texts(chunks)
        ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]

        self.col.add(
            ids=ids,
            documents=chunks,
            embeddings=vectors,
            metadatas=[{"source": pdf_path, "chunk": i} for i in range(len(chunks))],
        )
        return {"chunks_indexed": len(chunks), "best_strategy": summary}

    # 5) Retrieve  6) Ask LLM
    def query(self, question: str, k: int = 4) -> Dict:
        # Embed the query with the same model used at index time; passing
        # query_texts would fall back to Chroma's default embedding function,
        # which is not guaranteed to match the indexed vector space.
        results = self.col.query(query_embeddings=embed_texts([question]), n_results=k)
        chunks: List[str] = results["documents"][0] if results.get("documents") else []

        context = "\n\n".join(chunks)
        prompt = f"""You are an extraction assistant. Use ONLY the Context to answer.
Rules:
- If the answer is explicitly present in Context, return that substring EXACTLY.
- Do not paraphrase. Do not add words. Return a verbatim span from Context.
- If the answer is not in Context, reply exactly: I don't know

Question: {question}

Context:
{context}

Answer (verbatim from Context):"""
        resp = self.llm.invoke(prompt)
        answer = resp.content.strip()

        # Fallback if the model still hedges
        if (not answer or answer.lower().startswith("i don't know")) and context.strip():
            answer = chunks[0] if chunks else "I don't know"

        return {
            "answer": answer,
            "used_chunks": len(chunks),
            "context_preview": context[:500],
        }


if __name__ == "__main__":
    rag = RAGPipeline()
    info = rag.index_document("sample.pdf")  # ensure day3/sample.pdf exists
    print("Indexed:", info)

    out = rag.query("What text does the PDF contain?")
    print("Answer:", out["answer"])
    print("Used chunks:", out["used_chunks"])
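The pipeline reads GROQ_API_KEY through load_dotenv(), which matches the "no secrets" note in the commit message: the key lives in an untracked .env file (or an HF Space secret), never in the repo. A placeholder example:

# .env — never committed; placeholder value only
GROQ_API_KEY=your-groq-api-key-here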
day3/vector_store.py
ADDED
@@ -0,0 +1,15 @@
# vector_store.py
import chromadb

# New persistent client (replaces Settings / duckdb+parquet)
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = client.get_or_create_collection("pdf_docs")

def reset_db():
    # Drop and recreate the collection; rebind the module-level handle so
    # callers importing `collection` through this module don't keep a stale one.
    global collection
    client.delete_collection("pdf_docs")
    collection = client.get_or_create_collection("pdf_docs")
    return collection

if __name__ == "__main__":
    print("ChromaDB ready. Collections:", [c.name for c in client.list_collections()])
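A quick usage sketch for the module above (illustrative; the 384-dim zero vector is an arbitrary stand-in matching the all-MiniLM-L6-v2 dimensionality used elsewhere in this commit):

# vector_store_demo.py — illustrative only
from vector_store import reset_db

col = reset_db()  # start from an empty collection
col.add(ids=["t-0"], documents=["hello"], embeddings=[[0.0] * 384])
print(col.count())  # 1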