# First_RAG_System / day3 / chunking_test.py
# Author: Hamid Omarov
# HF Space app + minimal pipeline code (no secrets)
# (Hugging Face page artifacts removed: commit e7e9247, "raw / history blame", 2.92 kB)
# chunking_test.py
from langchain.text_splitter import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
TokenTextSplitter,
)
from pdf_loader import load_pdf
def docs_to_text(docs):
    """Flatten loaded PDF pages into a single string.

    Each document's ``page_content`` is concatenated with a blank line
    between pages, so paragraph-aware splitters can still detect page
    boundaries.
    """
    pages = (doc.page_content for doc in docs)
    return "\n\n".join(pages)
def split_text(text, splitter):
    """Delegate chunking of *text* to the given splitter.

    Thin wrapper kept so every splitter is invoked through one code path;
    returns whatever list of strings the splitter produces.
    """
    chunks = splitter.split_text(text)
    return chunks
def compute_metrics(chunks, unit="chars", chunk_size=None, chunk_overlap=None):
    """Summarize a chunk list: count, average length, and the overlap setting.

    Args:
        chunks: list of chunk strings produced by a splitter.
        unit: "chars" (default) or anything else (used as "tokens" by the
            caller). NOTE: no tokenizer is available here, so even in token
            mode the average is a *character* length — the key name
            "avg_len_str" flags that it is a string length, not a token count.
        chunk_size: accepted for interface compatibility with the call sites;
            currently unused in the reported metrics.
        chunk_overlap: echoed back under the "overlap" key.

    Returns:
        dict with "chunks", an average-length key ("avg_chars" for unit
        "chars", else "avg_len_str"), and "overlap".

    Fix: the original duplicated the size/average computation in both
    branches — only the dict key differed — so the shared work is hoisted.
    """
    sizes = [len(chunk) for chunk in chunks]
    # Guard against empty input: average defaults to 0 rather than dividing
    # by zero.
    avg = sum(sizes) / len(sizes) if sizes else 0
    avg_key = "avg_chars" if unit == "chars" else "avg_len_str"
    return {
        "chunks": len(chunks),
        avg_key: round(avg, 1),
        "overlap": chunk_overlap,
    }
def run_comparison(pdf_path="sample.pdf"):
    """Load a PDF and compare three chunking strategies side by side.

    Runs CharacterTextSplitter (fixed char size), RecursiveCharacterTextSplitter
    (hierarchical separators), and TokenTextSplitter over the same flattened
    text, then prints each strategy's metrics plus a preview of its first chunk.

    Args:
        pdf_path: path to the PDF handed to ``load_pdf``.

    Fix: the preview loop indexed ``chunks[0]`` unconditionally, which raised
    IndexError whenever a splitter returned no chunks (e.g. an empty PDF);
    empty results are now reported instead of crashing.
    """
    docs = load_pdf(pdf_path)
    text = docs_to_text(docs)

    # 1) Fixed size: split on newlines, target 800 chars with 100 overlap.
    fixed = CharacterTextSplitter(
        chunk_size=800, chunk_overlap=100, separator="\n"
    )
    fixed_chunks = split_text(text, fixed)
    fixed_metrics = compute_metrics(
        fixed_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 2) Recursive: try paragraph, then line, then word, then char boundaries.
    recursive = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    recursive_chunks = split_text(text, recursive)
    recursive_metrics = compute_metrics(
        recursive_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 3) Token-based: 512 tokens per chunk, 64 tokens of overlap.
    token = TokenTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
    )
    token_chunks = split_text(text, token)
    token_metrics = compute_metrics(
        token_chunks, unit="tokens", chunk_size=512, chunk_overlap=64
    )

    print("=== Chunking Comparison ===")
    print("Fixed (chars): ", fixed_metrics)
    print("Recursive (chars):", recursive_metrics)
    print("Token-based: ", token_metrics)

    # Optional: show first chunk samples for sanity
    print("\n--- Sample Chunks ---")
    for name, chunks in [
        ("Fixed", fixed_chunks),
        ("Recursive", recursive_chunks),
        ("Token", token_chunks),
    ]:
        if not chunks:
            # Previously crashed with IndexError on an empty result.
            print(f"{name} #1 →", "(no chunks)")
            continue
        preview = chunks[0][:200].replace("\n", " ")
        if len(chunks[0]) > 200:
            preview += "..."
        print(f"{name} #1 →", preview)
if __name__ == "__main__":
    # Script entry point: compare all three splitters on the bundled sample.
    run_comparison("sample.pdf")