# chunking_test.py
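"""Compare three LangChain chunking strategies on one PDF.

Runs CharacterTextSplitter, RecursiveCharacterTextSplitter, and
TokenTextSplitter over the same document and prints chunk-count and
average-size metrics for each.
"""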
import tiktoken
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)
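# pdf_loader is a local helper module (not shown here); load_pdf is assumed
# to return a list of LangChain Document objects, one per page, each with a
# .page_content string.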
from pdf_loader import load_pdf

# GPT/Copilot: "utility to flatten pages into a single string"
def docs_to_text(docs):
    return "\n\n".join([d.page_content for d in docs])

# GPT/Copilot: "run a splitter on text and return list[str]"
def split_text(text, splitter):
    return splitter.split_text(text)

# GPT/Copilot: "compute metrics: chunk count, average size (chars or tokens), and overlap setting"
def compute_metrics(chunks, unit="chars", chunk_size=None, chunk_overlap=None):
    if unit == "chars":
        sizes = [len(c) for c in chunks]
        avg = sum(sizes) / len(sizes) if sizes else 0
        return {
            "chunks": len(chunks),
            "avg_chars": round(avg, 1),
            "overlap": chunk_overlap,
        }
    else:
        # token mode will pass unit="tokens" and precomputed token sizes if needed
        sizes = [len(c) for c in chunks]  # placeholder, we’ll report char length anyway
        avg = sum(sizes) / len(sizes) if sizes else 0
        return {
            "chunks": len(chunks),
            "avg_len_str": round(avg, 1),
            "overlap": chunk_overlap,
        }

def run_comparison(pdf_path="sample.pdf"):
    docs = load_pdf(pdf_path)
    text = docs_to_text(docs)

    # 1) Fixed size (CharacterTextSplitter)
    fixed = CharacterTextSplitter(
        chunk_size=800, chunk_overlap=100, separator="\n"
    )
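    # Note: CharacterTextSplitter only splits on the given separator, so a
    # block of text with no "\n" can still yield a chunk larger than
    # chunk_size (langchain logs a warning when that happens).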
    fixed_chunks = split_text(text, fixed)
    fixed_metrics = compute_metrics(
        fixed_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 2) Recursive (RecursiveCharacterTextSplitter)
    recursive = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
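    # Falls back through the separator list (paragraphs -> lines -> words ->
    # characters), so chunks respect chunk_size while keeping natural breaks
    # where possible.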
    recursive_chunks = split_text(text, recursive)
    recursive_metrics = compute_metrics(
        recursive_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 3) Token-based (TokenTextSplitter)
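    # TokenTextSplitter counts tiktoken tokens (default encoding: "gpt2"),
    # so chunk_size=512 / chunk_overlap=64 are token counts, not characters.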
    token = TokenTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
    )
    token_chunks = split_text(text, token)
    token_metrics = compute_metrics(
        token_chunks, unit="tokens", chunk_size=512, chunk_overlap=64
    )

    print("=== Chunking Comparison ===")
    print("Fixed (chars):   ", fixed_metrics)
    print("Recursive (chars):", recursive_metrics)
    print("Token-based:      ", token_metrics)

    # Optional: show the first chunk from each strategy as a sanity check
    print("\n--- Sample Chunks ---")
    for name, chunks in [("Fixed", fixed_chunks), ("Recursive", recursive_chunks), ("Token", token_chunks)]:
        if not chunks:
            continue  # guard: a splitter can return no chunks on empty input
        preview = chunks[0][:200].replace("\n", " ") + ("..." if len(chunks[0]) > 200 else "")
        print(f"{name} #1 →", preview)

if __name__ == "__main__":
    run_comparison("sample.pdf")