HamidOmarov committed on
Commit a1e8893 · verified · 1 Parent(s): ed5341f

Delete day3

day3/chunking_test.py DELETED
@@ -1,83 +0,0 @@
- # chunking_test.py
- from langchain.text_splitter import (
-     CharacterTextSplitter,
-     RecursiveCharacterTextSplitter,
-     TokenTextSplitter,
- )
- from pdf_loader import load_pdf
-
- # GPT/Copilot: "utility to flatten pages into a single string"
- def docs_to_text(docs):
-     return "\n\n".join([d.page_content for d in docs])
-
- # GPT/Copilot: "run a splitter on text and return list[str]"
- def split_text(text, splitter):
-     return splitter.split_text(text)
-
- # GPT/Copilot: "compute metrics: chunk count, average size (chars or tokens), and overlap setting"
- def compute_metrics(chunks, unit="chars", chunk_size=None, chunk_overlap=None):
-     if unit == "chars":
-         sizes = [len(c) for c in chunks]
-         avg = sum(sizes) / len(sizes) if sizes else 0
-         return {
-             "chunks": len(chunks),
-             "avg_chars": round(avg, 1),
-             "overlap": chunk_overlap,
-         }
-     else:
-         # token mode will pass unit="tokens" and precomputed token sizes if needed
-         sizes = [len(c) for c in chunks]  # placeholder, we’ll report char length anyway
-         avg = sum(sizes) / len(sizes) if sizes else 0
-         return {
-             "chunks": len(chunks),
-             "avg_len_str": round(avg, 1),
-             "overlap": chunk_overlap,
-         }
-
- def run_comparison(pdf_path="sample.pdf"):
-     docs = load_pdf(pdf_path)
-     text = docs_to_text(docs)
-
-     # 1) Fixed size (CharacterTextSplitter)
-     fixed = CharacterTextSplitter(
-         chunk_size=800, chunk_overlap=100, separator="\n"
-     )
-     fixed_chunks = split_text(text, fixed)
-     fixed_metrics = compute_metrics(
-         fixed_chunks, unit="chars", chunk_size=800, chunk_overlap=100
-     )
-
-     # 2) Recursive (RecursiveCharacterTextSplitter)
-     recursive = RecursiveCharacterTextSplitter(
-         chunk_size=800,
-         chunk_overlap=100,
-         separators=["\n\n", "\n", " ", ""],
-     )
-     recursive_chunks = split_text(text, recursive)
-     recursive_metrics = compute_metrics(
-         recursive_chunks, unit="chars", chunk_size=800, chunk_overlap=100
-     )
-
-     # 3) Token-based (TokenTextSplitter)
-     token = TokenTextSplitter(
-         chunk_size=512,
-         chunk_overlap=64,
-     )
-     token_chunks = split_text(text, token)
-     token_metrics = compute_metrics(
-         token_chunks, unit="tokens", chunk_size=512, chunk_overlap=64
-     )
-
-     print("=== Chunking Comparison ===")
-     print("Fixed (chars): ", fixed_metrics)
-     print("Recursive (chars):", recursive_metrics)
-     print("Token-based: ", token_metrics)
-
-     # Optional: show first chunk samples for sanity
-     print("\n--- Sample Chunks ---")
-     for name, chunks in [("Fixed", fixed_chunks), ("Recursive", recursive_chunks), ("Token", token_chunks)]:
-         preview = chunks[0][:200].replace("\n", " ") + ("..." if len(chunks[0]) > 200 else "")
-         print(f"{name} #1 →", preview)
-
- if __name__ == "__main__":
-     run_comparison("sample.pdf")
day3/embeddings.py DELETED
@@ -1,12 +0,0 @@
- from typing import List, Dict
- from sentence_transformers import SentenceTransformer
-
- _embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
- def embed_texts(texts: List[str]) -> List[List[float]]:
-     # Return as Python lists of floats (Chroma-compatible)
-     return _embedder.encode(texts, convert_to_numpy=True).tolist()
-
- def create_embeddings(chunks: List[str]) -> Dict:
-     vectors = embed_texts(chunks)
-     return {"embeddings": vectors, "count": len(vectors)}
day3/gradio_rag.py DELETED
@@ -1,44 +0,0 @@
- # day3/gradio_rag.py
- import gradio as gr
- from dotenv import load_dotenv
- from rag_system import RAGPipeline
- import traceback
- import os
-
- load_dotenv()  # ensure GROQ_API_KEY is loaded for ChatGroq
-
- # Use a writable persistent dir
- rag = RAGPipeline(persist_dir="./chroma_db_space", collection_name="pdf_docs")
-
- def chat_with_pdf(pdf_path: str, question: str):
-     try:
-         if not pdf_path:
-             return "Please upload a PDF."
-         if not question or not question.strip():
-             return "Please enter a question."
-
-         # Index the uploaded PDF (path is a string because of type='filepath')
-         rag.index_document(pdf_path, doc_id_prefix="upload")
-
-         # Ask
-         out = rag.query(question, k=4)
-         return out["answer"]
-     except Exception as e:
-         # Surface the exact error to the UI for debugging
-         return f"Error: {e}\n\n{traceback.format_exc()}"
-
- demo = gr.Interface(
-     fn=chat_with_pdf,
-     inputs=[
-         gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath"),
-         gr.Textbox(label="Ask a question", placeholder="What does the PDF say?"),
-     ],
-     outputs=gr.Textbox(label="Answer"),
-     title="PDF RAG (Chroma + Groq)",
-     description="Upload a PDF and ask a question. Uses Chroma for retrieval and Groq LLM for answers."
- )
-
- if __name__ == "__main__":
-     # Optional: show whether the env var is visible to this process
-     print("GROQ key present:", bool(os.getenv("GROQ_API_KEY")))
-     demo.launch()
day3/optimal_chunker.py DELETED
@@ -1,113 +0,0 @@
- # optimal_chunker.py
- from typing import Dict, List, Tuple
- from statistics import mean
- from langchain.text_splitter import (
-     CharacterTextSplitter,
-     RecursiveCharacterTextSplitter,
-     TokenTextSplitter,
- )
- from pdf_loader import load_pdf
-
- # --- Helpers ---
- def docs_to_text(docs) -> str:
-     return "\n\n".join([d.page_content for d in docs])
-
- def run_splitter(text: str, splitter) -> List[str]:
-     return splitter.split_text(text)
-
- def metrics(chunks: List[str]) -> Dict:
-     if not chunks:
-         return {"chunks": 0, "avg_len": 0, "max_len": 0}
-     lens = [len(c) for c in chunks]
-     return {
-         "chunks": len(chunks),
-         "avg_len": round(mean(lens), 1),
-         "max_len": max(lens),
-     }
-
- # --- Strategy evaluation ---
- def evaluate_strategies(
-     text: str,
-     char_size: int = 800,
-     char_overlap: int = 100,
-     token_size: int = 512,
-     token_overlap: int = 64,
- ) -> Dict[str, Dict]:
-     fixed = CharacterTextSplitter(chunk_size=char_size, chunk_overlap=char_overlap, separator="\n")
-     recursive = RecursiveCharacterTextSplitter(
-         chunk_size=char_size, chunk_overlap=char_overlap, separators=["\n\n", "\n", " ", ""]
-     )
-     token = TokenTextSplitter(chunk_size=token_size, chunk_overlap=token_overlap)
-
-     fixed_chunks = run_splitter(text, fixed)
-     rec_chunks = run_splitter(text, recursive)
-     tok_chunks = run_splitter(text, token)
-
-     return {
-         "fixed": {"chunks": fixed_chunks, "metrics": metrics(fixed_chunks), "meta": {"size": char_size, "overlap": char_overlap, "unit": "chars"}},
-         "recursive": {"chunks": rec_chunks, "metrics": metrics(rec_chunks), "meta": {"size": char_size, "overlap": char_overlap, "unit": "chars"}},
-         "token": {"chunks": tok_chunks, "metrics": metrics(tok_chunks), "meta": {"size": token_size, "overlap": token_overlap, "unit": "tokens"}},
-     }
-
- def score(candidate: Dict, target_avg: int = 800, hard_max: int = 1500) -> float:
-     """Lower is better: distance to target + penalty if max chunk too large."""
-     m = candidate["metrics"]
-     dist = abs(m["avg_len"] - target_avg)
-     penalty = 0 if m["max_len"] <= hard_max else (m["max_len"] - hard_max)
-     # Favor more, smaller chunks over 1 giant chunk
-     few_chunk_penalty = 500 if m["chunks"] <= 1 else 0
-     return dist + penalty + few_chunk_penalty
-
- def select_best(evals: Dict[str, Dict], target_avg: int = 800, hard_max: int = 1500) -> Tuple[str, Dict]:
-     scored = [(name, score(info, target_avg, hard_max)) for name, info in evals.items()]
-     scored.sort(key=lambda x: x[1])
-     return scored[0][0], evals[scored[0][0]]
-
- # --- Final pipeline API ---
- class OptimalChunker:
-     def __init__(
-         self,
-         char_size: int = 800,
-         char_overlap: int = 100,
-         token_size: int = 512,
-         token_overlap: int = 64,
-         target_avg: int = 800,
-         hard_max: int = 1500,
-     ):
-         self.char_size = char_size
-         self.char_overlap = char_overlap
-         self.token_size = token_size
-         self.token_overlap = token_overlap
-         self.target_avg = target_avg
-         self.hard_max = hard_max
-         self.best_name = None
-         self.best_info = None
-
-     def fit_on_text(self, text: str) -> Dict:
-         evals = evaluate_strategies(
-             text,
-             char_size=self.char_size,
-             char_overlap=self.char_overlap,
-             token_size=self.token_size,
-             token_overlap=self.token_overlap,
-         )
-         self.best_name, self.best_info = select_best(evals, self.target_avg, self.hard_max)
-         return {"best": self.best_name, "metrics": self.best_info["metrics"], "meta": self.best_info["meta"]}
-
-     def transform(self) -> List[str]:
-         assert self.best_info is not None, "Call fit_on_text first."
-         return self.best_info["chunks"]
-
-     def fit_transform_pdf(self, pdf_path: str) -> Tuple[str, List[str], Dict]:
-         docs = load_pdf(pdf_path)
-         text = docs_to_text(docs)
-         summary = self.fit_on_text(text)
-         return self.best_name, self.transform(), summary
-
- if __name__ == "__main__":
-     # Demo on sample.pdf
-     ch = OptimalChunker()
-     best, chunks, summary = ch.fit_transform_pdf("sample.pdf")
-     print("=== Best Strategy ===")
-     print(best, summary)
-     print(f"First chunk preview:\n{chunks[0][:300] if chunks else ''}")
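As a quick illustration of the score() heuristic above, here is a small worked example with hypothetical metrics (not taken from any real PDF), using the default target_avg=800 and hard_max=1500:

from optimal_chunker import score

candidate = {"metrics": {"chunks": 5, "avg_len": 650.0, "max_len": 1600}}
# distance to target: |650 - 800| = 150; overflow penalty: 1600 - 1500 = 100; few-chunk penalty: 0
print(score(candidate))  # 250.0 (lower is better)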
day3/pdf_loader.py DELETED
@@ -1,13 +0,0 @@
- from langchain_community.document_loaders import PyPDFLoader
-
- def load_pdf(file_path):
-     loader = PyPDFLoader(file_path)
-     pages = loader.load()
-     return pages
-
- if __name__ == "__main__":
-     docs = load_pdf("sample.pdf")
-     print(f"✅ Loaded {len(docs)} pages")
-     for i, page in enumerate(docs, start=1):
-         print(f"--- Page {i} ---")
-         print(page.page_content)
day3/rag_system.py DELETED
@@ -1,86 +0,0 @@
- # day3/rag_system.py
- from typing import List, Dict
- import os
- import chromadb
-
- from pdf_loader import load_pdf
- from optimal_chunker import OptimalChunker
- from embeddings import embed_texts
-
- # The LLM is created later (lazily)
- from langchain_groq import ChatGroq
- from dotenv import load_dotenv
-
- load_dotenv()
-
-
- class RAGPipeline:
-     def __init__(self, persist_dir: str = "./chroma_db_space", collection_name: str = "pdf_docs"):
-         # Vector DB (Chroma 1.x)
-         self.client = chromadb.PersistentClient(path=persist_dir)
-         self.col = self.client.get_or_create_collection(name=collection_name)
-
-         # Chunker
-         self.chunker = OptimalChunker()
-
-         # The LLM is NOT created here yet (lazy)
-         self.llm = None
-
-     def _ensure_llm(self):
-         """Create the LLM lazily (on the first query) if GROQ_API_KEY is set."""
-         if self.llm is None:
-             api_key = os.getenv("GROQ_API_KEY")
-             if not api_key:
-                 # Raise a clear message so the app can still start without the LLM
-                 raise RuntimeError(
-                     "GROQ_API_KEY not found. Add it under Space Settings → Variables and secrets."
-                 )
-             self.llm = ChatGroq(model="llama3-8b-8192", temperature=0.0, api_key=api_key)
-
-     # 1) Load 2) Chunk 3) Embed 4) Upsert to Chroma
-     def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
-         docs = load_pdf(pdf_path)
-         text = "\n\n".join(d.page_content for d in docs)
-
-         summary = self.chunker.fit_on_text(text)
-         chunks = self.chunker.transform()
-
-         vectors = embed_texts(chunks)  # list[list[float]]
-         ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]
-
-         self.col.add(
-             ids=ids,
-             documents=chunks,
-             embeddings=vectors,
-             metadatas=[{"source": pdf_path, "chunk": i} for i in range(len(chunks))],
-         )
-         return {"chunks_indexed": len(chunks), "best_strategy": summary}
-
-     # 5) Retrieve 6) Ask LLM
-     def query(self, question: str, k: int = 4) -> Dict:
-         # Create the LLM at this point (if the secret is missing, a clear error surfaces here)
-         self._ensure_llm()
-
-         results = self.col.query(query_texts=[question], n_results=k)
-         chunks: List[str] = results["documents"][0] if results.get("documents") else []
-
-         context = "\n\n".join(chunks)
-         prompt = f"""You are an extraction assistant. Use ONLY the Context to answer.
- Rules:
- If the answer is explicitly present in Context, return that substring EXACTLY.
- Do not paraphrase. Do not add words. Return a verbatim span from Context.
- If the answer is not in Context, reply exactly: I don't know
-
- Question: {question}
-
- Context:
- {context}
-
- Answer (verbatim from Context):"""
-         resp = self.llm.invoke(prompt)
-         answer = resp.content.strip() if hasattr(resp, "content") else str(resp)
-
-         if (not answer or answer.lower().startswith("i don't know")) and context.strip():
-             answer = chunks[0] if chunks else "I don't know"
-
-         return {"answer": answer, "used_chunks": len(chunks), "context_preview": context[:500]}
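For reference, a minimal end-to-end usage sketch of RAGPipeline outside Gradio (assuming the day3 modules are importable, a sample.pdf is present, and GROQ_API_KEY is set; the question string is just a placeholder):

from rag_system import RAGPipeline

rag = RAGPipeline(persist_dir="./chroma_db_space", collection_name="pdf_docs")
stats = rag.index_document("sample.pdf", doc_id_prefix="demo")  # load, chunk, embed, upsert into Chroma
print(stats["chunks_indexed"], stats["best_strategy"])
out = rag.query("What is this document about?", k=4)            # retrieve top-k chunks, then ask the Groq LLM
print(out["answer"])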
day3/requirements.txt DELETED
@@ -1,6 +0,0 @@
- gradio
- chromadb
- sentence-transformers
- langchain-groq
- pypdf
- python-dotenv
day3/vector_store.py DELETED
@@ -1,15 +0,0 @@
- # vector_store.py
- import chromadb
-
- # New persistent client (replaces Settings / duckdb+parquet)
- client = chromadb.PersistentClient(path="./chroma_db")
-
- # Create or get collection
- collection = client.get_or_create_collection("pdf_docs")
-
- def reset_db():
-     client.delete_collection("pdf_docs")
-     return client.get_or_create_collection("pdf_docs")
-
- if __name__ == "__main__":
-     print("ChromaDB ready. Collections:", [c.name for c in client.list_collections()])