# chunking_test.py
"""Compare fixed-size, recursive, and token-based chunking strategies on a PDF.

Loads a PDF via the project-local ``pdf_loader``, flattens it to one string,
splits it with three LangChain splitters, and prints summary metrics plus a
short preview of each strategy's first chunk.
"""

from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

from pdf_loader import load_pdf


def docs_to_text(docs):
    """Flatten loaded page documents into one string, pages joined by blank lines."""
    return "\n\n".join(d.page_content for d in docs)


def split_text(text, splitter):
    """Run *splitter* on *text* and return the resulting list of chunk strings."""
    return splitter.split_text(text)


def compute_metrics(chunks, unit="chars", chunk_size=None, chunk_overlap=None):
    """Summarize a chunking run: chunk count, average chunk length, and config.

    Args:
        chunks: list of chunk strings produced by a splitter.
        unit: "chars" or "tokens" — selects the report key name only. Lengths
            are always measured in characters here; a true token count would
            require the splitter's tokenizer (e.g. tiktoken for
            TokenTextSplitter), which this script does not invoke.
        chunk_size: the splitter's configured chunk size, echoed in the report.
        chunk_overlap: the splitter's configured overlap, echoed in the report.

    Returns:
        dict with "chunks", an average-length key ("avg_chars" for the chars
        mode, "avg_len_str" otherwise — names kept from the original report
        format), "chunk_size", and "overlap".
    """
    sizes = [len(c) for c in chunks]
    # Guard against an empty chunk list to avoid ZeroDivisionError.
    avg = sum(sizes) / len(sizes) if sizes else 0
    avg_key = "avg_chars" if unit == "chars" else "avg_len_str"
    return {
        "chunks": len(chunks),
        avg_key: round(avg, 1),
        # Previously accepted but never reported; included for completeness.
        "chunk_size": chunk_size,
        "overlap": chunk_overlap,
    }


def run_comparison(pdf_path="sample.pdf"):
    """Load *pdf_path*, chunk it three ways, and print comparative metrics."""
    docs = load_pdf(pdf_path)
    text = docs_to_text(docs)

    # 1) Fixed size (CharacterTextSplitter): splits on "\n" then packs to size.
    fixed = CharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separator="\n",
    )
    fixed_chunks = split_text(text, fixed)
    fixed_metrics = compute_metrics(
        fixed_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 2) Recursive (RecursiveCharacterTextSplitter): tries separators in order,
    #    falling back to finer ones so chunks respect paragraph/line boundaries.
    recursive = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    recursive_chunks = split_text(text, recursive)
    recursive_metrics = compute_metrics(
        recursive_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 3) Token-based (TokenTextSplitter): sizes are in tokens, not characters.
    token = TokenTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
    )
    token_chunks = split_text(text, token)
    token_metrics = compute_metrics(
        token_chunks, unit="tokens", chunk_size=512, chunk_overlap=64
    )

    print("=== Chunking Comparison ===")
    print("Fixed (chars): ", fixed_metrics)
    print("Recursive (chars):", recursive_metrics)
    print("Token-based: ", token_metrics)

    # Optional: show first chunk samples for sanity
    print("\n--- Sample Chunks ---")
    for name, chunks in [
        ("Fixed", fixed_chunks),
        ("Recursive", recursive_chunks),
        ("Token", token_chunks),
    ]:
        # Guard: an empty PDF (or degenerate split) would raise IndexError
        # on chunks[0]; report it instead of crashing.
        if not chunks:
            print(f"{name} #1 →", "(no chunks)")
            continue
        preview = chunks[0][:200].replace("\n", " ") + (
            "..." if len(chunks[0]) > 200 else ""
        )
        print(f"{name} #1 →", preview)


if __name__ == "__main__":
    run_comparison("sample.pdf")