HamidOmarov committed on
Commit a1e8893 · verified · 1 Parent(s): ed5341f

Delete day3

day3/chunking_test.py DELETED
@@ -1,83 +0,0 @@
- # chunking_test.py
- from langchain.text_splitter import (
-     CharacterTextSplitter,
-     RecursiveCharacterTextSplitter,
-     TokenTextSplitter,
- )
- from pdf_loader import load_pdf
-
- # GPT/Copilot: "utility to flatten pages into a single string"
- def docs_to_text(docs):
-     return "\n\n".join([d.page_content for d in docs])
-
- # GPT/Copilot: "run a splitter on text and return list[str]"
- def split_text(text, splitter):
-     return splitter.split_text(text)
-
- # GPT/Copilot: "compute metrics: chunk count, average size (chars or tokens), and overlap setting"
- def compute_metrics(chunks, unit="chars", chunk_size=None, chunk_overlap=None):
-     if unit == "chars":
-         sizes = [len(c) for c in chunks]
-         avg = sum(sizes) / len(sizes) if sizes else 0
-         return {
-             "chunks": len(chunks),
-             "avg_chars": round(avg, 1),
-             "overlap": chunk_overlap,
-         }
-     else:
-         # token mode will pass unit="tokens" and precomputed token sizes if needed
-         sizes = [len(c) for c in chunks]  # placeholder, we’ll report char length anyway
-         avg = sum(sizes) / len(sizes) if sizes else 0
-         return {
-             "chunks": len(chunks),
-             "avg_len_str": round(avg, 1),
-             "overlap": chunk_overlap,
-         }
-
- def run_comparison(pdf_path="sample.pdf"):
-     docs = load_pdf(pdf_path)
-     text = docs_to_text(docs)
-
-     # 1) Fixed size (CharacterTextSplitter)
-     fixed = CharacterTextSplitter(
-         chunk_size=800, chunk_overlap=100, separator="\n"
-     )
-     fixed_chunks = split_text(text, fixed)
-     fixed_metrics = compute_metrics(
-         fixed_chunks, unit="chars", chunk_size=800, chunk_overlap=100
-     )
-
-     # 2) Recursive (RecursiveCharacterTextSplitter)
-     recursive = RecursiveCharacterTextSplitter(
-         chunk_size=800,
-         chunk_overlap=100,
-         separators=["\n\n", "\n", " ", ""],
-     )
-     recursive_chunks = split_text(text, recursive)
-     recursive_metrics = compute_metrics(
-         recursive_chunks, unit="chars", chunk_size=800, chunk_overlap=100
-     )
-
-     # 3) Token-based (TokenTextSplitter)
-     token = TokenTextSplitter(
-         chunk_size=512,
-         chunk_overlap=64,
-     )
-     token_chunks = split_text(text, token)
-     token_metrics = compute_metrics(
-         token_chunks, unit="tokens", chunk_size=512, chunk_overlap=64
-     )
-
-     print("=== Chunking Comparison ===")
-     print("Fixed (chars): ", fixed_metrics)
-     print("Recursive (chars):", recursive_metrics)
-     print("Token-based: ", token_metrics)
-
-     # Optional: show first chunk samples for sanity
-     print("\n--- Sample Chunks ---")
-     for name, chunks in [("Fixed", fixed_chunks), ("Recursive", recursive_chunks), ("Token", token_chunks)]:
-         preview = chunks[0][:200].replace("\n", " ") + ("..." if len(chunks[0]) > 200 else "")
-         print(f"{name} #1 →", preview)
-
- if __name__ == "__main__":
-     run_comparison("sample.pdf")
day3/embeddings.py DELETED
@@ -1,12 +0,0 @@
- from typing import List, Dict
- from sentence_transformers import SentenceTransformer
-
- _embedder = SentenceTransformer("all-MiniLM-L6-v2")
-
- def embed_texts(texts: List[str]) -> List[List[float]]:
-     # Return as Python lists of floats (Chroma-compatible)
-     return _embedder.encode(texts, convert_to_numpy=True).tolist()
-
- def create_embeddings(chunks: List[str]) -> Dict:
-     vectors = embed_texts(chunks)
-     return {"embeddings": vectors, "count": len(vectors)}
day3/gradio_rag.py DELETED
@@ -1,44 +0,0 @@
- # day3/gradio_rag.py
- import gradio as gr
- from dotenv import load_dotenv
- from rag_system import RAGPipeline
- import traceback
- import os
-
- load_dotenv()  # ensure GROQ_API_KEY is loaded for ChatGroq
-
- # Use a writable persistent dir
- rag = RAGPipeline(persist_dir="./chroma_db_space", collection_name="pdf_docs")
-
- def chat_with_pdf(pdf_path: str, question: str):
-     try:
-         if not pdf_path:
-             return "Please upload a PDF."
-         if not question or not question.strip():
-             return "Please enter a question."
-
-         # Index the uploaded PDF (path is a string because of type='filepath')
-         rag.index_document(pdf_path, doc_id_prefix="upload")
-
-         # Ask
-         out = rag.query(question, k=4)
-         return out["answer"]
-     except Exception as e:
-         # Surface the exact error to the UI for debugging
-         return f"Error: {e}\n\n{traceback.format_exc()}"
-
- demo = gr.Interface(
-     fn=chat_with_pdf,
-     inputs=[
-         gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath"),
-         gr.Textbox(label="Ask a question", placeholder="What does the PDF say?"),
-     ],
-     outputs=gr.Textbox(label="Answer"),
-     title="PDF RAG (Chroma + Groq)",
-     description="Upload a PDF and ask a question. Uses Chroma for retrieval and Groq LLM for answers."
- )
-
- if __name__ == "__main__":
-     # Optional: show whether the env var is visible to this process
-     print("GROQ key present:", bool(os.getenv("GROQ_API_KEY")))
-     demo.launch()
day3/optimal_chunker.py DELETED
@@ -1,113 +0,0 @@
- # optimal_chunker.py
- from typing import Dict, List, Tuple
- from statistics import mean
- from langchain.text_splitter import (
-     CharacterTextSplitter,
-     RecursiveCharacterTextSplitter,
-     TokenTextSplitter,
- )
- from pdf_loader import load_pdf
-
- # --- Helpers ---
- def docs_to_text(docs) -> str:
-     return "\n\n".join([d.page_content for d in docs])
-
- def run_splitter(text: str, splitter) -> List[str]:
-     return splitter.split_text(text)
-
- def metrics(chunks: List[str]) -> Dict:
-     if not chunks:
-         return {"chunks": 0, "avg_len": 0, "max_len": 0}
-     lens = [len(c) for c in chunks]
-     return {
-         "chunks": len(chunks),
-         "avg_len": round(mean(lens), 1),
-         "max_len": max(lens),
-     }
-
- # --- Strategy evaluation ---
- def evaluate_strategies(
-     text: str,
-     char_size: int = 800,
-     char_overlap: int = 100,
-     token_size: int = 512,
-     token_overlap: int = 64,
- ) -> Dict[str, Dict]:
-     fixed = CharacterTextSplitter(chunk_size=char_size, chunk_overlap=char_overlap, separator="\n")
-     recursive = RecursiveCharacterTextSplitter(
-         chunk_size=char_size, chunk_overlap=char_overlap, separators=["\n\n", "\n", " ", ""]
-     )
-     token = TokenTextSplitter(chunk_size=token_size, chunk_overlap=token_overlap)
-
-     fixed_chunks = run_splitter(text, fixed)
-     rec_chunks = run_splitter(text, recursive)
-     tok_chunks = run_splitter(text, token)
-
-     return {
-         "fixed": {"chunks": fixed_chunks, "metrics": metrics(fixed_chunks), "meta": {"size": char_size, "overlap": char_overlap, "unit": "chars"}},
-         "recursive": {"chunks": rec_chunks, "metrics": metrics(rec_chunks), "meta": {"size": char_size, "overlap": char_overlap, "unit": "chars"}},
-         "token": {"chunks": tok_chunks, "metrics": metrics(tok_chunks), "meta": {"size": token_size, "overlap": token_overlap, "unit": "tokens"}},
-     }
-
- def score(candidate: Dict, target_avg: int = 800, hard_max: int = 1500) -> float:
-     """Lower is better: distance to target + penalty if max chunk too large."""
-     m = candidate["metrics"]
-     dist = abs(m["avg_len"] - target_avg)
-     penalty = 0 if m["max_len"] <= hard_max else (m["max_len"] - hard_max)
-     # Favor more, smaller chunks over 1 giant chunk
-     few_chunk_penalty = 500 if m["chunks"] <= 1 else 0
-     return dist + penalty + few_chunk_penalty
-
- def select_best(evals: Dict[str, Dict], target_avg: int = 800, hard_max: int = 1500) -> Tuple[str, Dict]:
-     scored = [(name, score(info, target_avg, hard_max)) for name, info in evals.items()]
-     scored.sort(key=lambda x: x[1])
-     return scored[0][0], evals[scored[0][0]]
-
- # --- Final pipeline API ---
- class OptimalChunker:
-     def __init__(
-         self,
-         char_size: int = 800,
-         char_overlap: int = 100,
-         token_size: int = 512,
-         token_overlap: int = 64,
-         target_avg: int = 800,
-         hard_max: int = 1500,
-     ):
-         self.char_size = char_size
-         self.char_overlap = char_overlap
-         self.token_size = token_size
-         self.token_overlap = token_overlap
-         self.target_avg = target_avg
-         self.hard_max = hard_max
-         self.best_name = None
-         self.best_info = None
-
-     def fit_on_text(self, text: str) -> Dict:
-         evals = evaluate_strategies(
-             text,
-             char_size=self.char_size,
-             char_overlap=self.char_overlap,
-             token_size=self.token_size,
-             token_overlap=self.token_overlap,
-         )
-         self.best_name, self.best_info = select_best(evals, self.target_avg, self.hard_max)
-         return {"best": self.best_name, "metrics": self.best_info["metrics"], "meta": self.best_info["meta"]}
-
-     def transform(self) -> List[str]:
-         assert self.best_info is not None, "Call fit_on_text first."
-         return self.best_info["chunks"]
-
-     def fit_transform_pdf(self, pdf_path: str) -> Tuple[str, List[str], Dict]:
-         docs = load_pdf(pdf_path)
-         text = docs_to_text(docs)
-         summary = self.fit_on_text(text)
-         return self.best_name, self.transform(), summary
-
- if __name__ == "__main__":
-     # Demo on sample.pdf
-     ch = OptimalChunker()
-     best, chunks, summary = ch.fit_transform_pdf("sample.pdf")
-     print("=== Best Strategy ===")
-     print(best, summary)
-     print(f"First chunk preview:\n{chunks[0][:300] if chunks else ''}")
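As a quick illustration of the score() heuristic above, here is a small worked example with hypothetical metrics (not taken from any real PDF), using the default target_avg=800 and hard_max=1500:

from optimal_chunker import score

candidate = {"metrics": {"chunks": 5, "avg_len": 650.0, "max_len": 1600}}
# distance to target: |650 - 800| = 150; overflow penalty: 1600 - 1500 = 100; few-chunk penalty: 0
print(score(candidate))  # 250.0 (lower is better)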
day3/pdf_loader.py DELETED
@@ -1,13 +0,0 @@
- from langchain_community.document_loaders import PyPDFLoader
-
- def load_pdf(file_path):
-     loader = PyPDFLoader(file_path)
-     pages = loader.load()
-     return pages
-
- if __name__ == "__main__":
-     docs = load_pdf("sample.pdf")
-     print(f"✅ Loaded {len(docs)} pages")
-     for i, page in enumerate(docs, start=1):
-         print(f"--- Page {i} ---")
-         print(page.page_content)
day3/rag_system.py DELETED
@@ -1,86 +0,0 @@
- # day3/rag_system.py
- from typing import List, Dict
- import os
- import chromadb
-
- from pdf_loader import load_pdf
- from optimal_chunker import OptimalChunker
- from embeddings import embed_texts
-
- # The LLM is created later (lazily)
- from langchain_groq import ChatGroq
- from dotenv import load_dotenv
-
- load_dotenv()
-
-
- class RAGPipeline:
-     def __init__(self, persist_dir: str = "./chroma_db_space", collection_name: str = "pdf_docs"):
-         # Vector DB (Chroma 1.x)
-         self.client = chromadb.PersistentClient(path=persist_dir)
-         self.col = self.client.get_or_create_collection(name=collection_name)
-
-         # Chunker
-         self.chunker = OptimalChunker()
-
-         # The LLM is NOT created here yet (lazy)
-         self.llm = None
-
-     def _ensure_llm(self):
-         """Create the LLM lazily (on the first query) if GROQ_API_KEY is set."""
-         if self.llm is None:
-             api_key = os.getenv("GROQ_API_KEY")
-             if not api_key:
-                 # Raise a clear message so the app can still start without the LLM
-                 raise RuntimeError(
-                     "GROQ_API_KEY not found. Add it under Space Settings → Variables and secrets."
-                 )
-             self.llm = ChatGroq(model="llama3-8b-8192", temperature=0.0, api_key=api_key)
-
-     # 1) Load 2) Chunk 3) Embed 4) Upsert to Chroma
-     def index_document(self, pdf_path: str, doc_id_prefix: str = "doc") -> Dict:
-         docs = load_pdf(pdf_path)
-         text = "\n\n".join(d.page_content for d in docs)
-
-         summary = self.chunker.fit_on_text(text)
-         chunks = self.chunker.transform()
-
-         vectors = embed_texts(chunks)  # list[list[float]]
-         ids = [f"{doc_id_prefix}-{i}" for i in range(len(chunks))]
-
-         self.col.add(
-             ids=ids,
-             documents=chunks,
-             embeddings=vectors,
-             metadatas=[{"source": pdf_path, "chunk": i} for i in range(len(chunks))],
-         )
-         return {"chunks_indexed": len(chunks), "best_strategy": summary}
-
-     # 5) Retrieve 6) Ask LLM
-     def query(self, question: str, k: int = 4) -> Dict:
-         # Create the LLM at this point (if the secret is missing, a clear error surfaces here)
-         self._ensure_llm()
-
-         results = self.col.query(query_texts=[question], n_results=k)
-         chunks: List[str] = results["documents"][0] if results.get("documents") else []
-
-         context = "\n\n".join(chunks)
-         prompt = f"""You are an extraction assistant. Use ONLY the Context to answer.
- Rules:
- If the answer is explicitly present in Context, return that substring EXACTLY.
- Do not paraphrase. Do not add words. Return a verbatim span from Context.
- If the answer is not in Context, reply exactly: I don't know
-
- Question: {question}
-
- Context:
- {context}
-
- Answer (verbatim from Context):"""
-         resp = self.llm.invoke(prompt)
-         answer = resp.content.strip() if hasattr(resp, "content") else str(resp)
-
-         if (not answer or answer.lower().startswith("i don't know")) and context.strip():
-             answer = chunks[0] if chunks else "I don't know"
-
-         return {"answer": answer, "used_chunks": len(chunks), "context_preview": context[:500]}
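For reference, a minimal end-to-end usage sketch of RAGPipeline outside Gradio (assuming the day3 modules are importable, a sample.pdf is present, and GROQ_API_KEY is set; the question string is just a placeholder):

from rag_system import RAGPipeline

rag = RAGPipeline(persist_dir="./chroma_db_space", collection_name="pdf_docs")
stats = rag.index_document("sample.pdf", doc_id_prefix="demo")  # load, chunk, embed, upsert into Chroma
print(stats["chunks_indexed"], stats["best_strategy"])
out = rag.query("What is this document about?", k=4)            # retrieve top-k chunks, then ask the Groq LLM
print(out["answer"])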
day3/requirements.txt DELETED
@@ -1,6 +0,0 @@
- gradio
- chromadb
- sentence-transformers
- langchain-groq
- pypdf
- python-dotenv
day3/vector_store.py DELETED
@@ -1,15 +0,0 @@
- # vector_store.py
- import chromadb
-
- # New persistent client (replaces Settings / duckdb+parquet)
- client = chromadb.PersistentClient(path="./chroma_db")
-
- # Create or get collection
- collection = client.get_or_create_collection("pdf_docs")
-
- def reset_db():
-     client.delete_collection("pdf_docs")
-     return client.get_or_create_collection("pdf_docs")
-
- if __name__ == "__main__":
-     print("ChromaDB ready. Collections:", [c.name for c in client.list_collections()])