# chunking_test.py
# (Newer LangChain releases ship these splitter classes in the standalone
# langchain_text_splitters package; adjust the import if needed.)
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

from pdf_loader import load_pdf
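
# load_pdf comes from the pdf_loader module built earlier. For readers without
# that file, a minimal sketch of the assumed interface (hypothetical; the real
# loader may differ):
#
#     from langchain_community.document_loaders import PyPDFLoader
#
#     def load_pdf(path):
#         # one LangChain Document per page, text in .page_content
#         return PyPDFLoader(path).load()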
# GPT/Copilot: "utility to flatten pages into a single string" | |
def docs_to_text(docs): | |
return "\n\n".join([d.page_content for d in docs]) | |
# GPT/Copilot: "run a splitter on text and return list[str]" | |
def split_text(text, splitter): | |
return splitter.split_text(text) | |
# GPT/Copilot: "compute metrics: chunk count, average size (chars or tokens), and overlap setting" | |
def compute_metrics(chunks, unit="chars", chunk_size=None, chunk_overlap=None): | |
if unit == "chars": | |
sizes = [len(c) for c in chunks] | |
avg = sum(sizes) / len(sizes) if sizes else 0 | |
return { | |
"chunks": len(chunks), | |
"avg_chars": round(avg, 1), | |
"overlap": chunk_overlap, | |
} | |
else: | |
# token mode will pass unit="tokens" and precomputed token sizes if needed | |
sizes = [len(c) for c in chunks] # placeholder, we’ll report char length anyway | |
avg = sum(sizes) / len(sizes) if sizes else 0 | |
return { | |
"chunks": len(chunks), | |
"avg_len_str": round(avg, 1), | |
"overlap": chunk_overlap, | |
} | |
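
# Hedged sketch: to report real token counts in token mode, count tokens with
# tiktoken, the tokenizer TokenTextSplitter wraps by default ("gpt2"
# encoding). The token_sizes helper is ours, not part of LangChain.
def token_sizes(chunks, encoding_name="gpt2"):
    import tiktoken  # local import: only needed for token-mode metrics
    enc = tiktoken.get_encoding(encoding_name)
    return [len(enc.encode(c)) for c in chunks]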

def run_comparison(pdf_path="sample.pdf"):
    docs = load_pdf(pdf_path)
    text = docs_to_text(docs)

    # 1) Fixed size (CharacterTextSplitter)
    fixed = CharacterTextSplitter(
        chunk_size=800, chunk_overlap=100, separator="\n"
    )
    fixed_chunks = split_text(text, fixed)
    fixed_metrics = compute_metrics(
        fixed_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 2) Recursive (RecursiveCharacterTextSplitter)
    recursive = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )
    recursive_chunks = split_text(text, recursive)
    recursive_metrics = compute_metrics(
        recursive_chunks, unit="chars", chunk_size=800, chunk_overlap=100
    )

    # 3) Token-based (TokenTextSplitter)
    token = TokenTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
    )
    token_chunks = split_text(text, token)
    token_metrics = compute_metrics(
        token_chunks, unit="tokens", chunk_size=512, chunk_overlap=64
    )
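
    # Optional sketch: add a true average token count to the token-mode
    # metrics via the token_sizes helper above (needs tiktoken installed).
    t_sizes = token_sizes(token_chunks)
    token_metrics["avg_tokens"] = round(sum(t_sizes) / len(t_sizes), 1) if t_sizes else 0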
print("=== Chunking Comparison ===") | |
print("Fixed (chars): ", fixed_metrics) | |
print("Recursive (chars):", recursive_metrics) | |
print("Token-based: ", token_metrics) | |

    # Optional: show first chunk samples for sanity
    print("\n--- Sample Chunks ---")
    for name, chunks in [
        ("Fixed", fixed_chunks),
        ("Recursive", recursive_chunks),
        ("Token", token_chunks),
    ]:
        if not chunks:
            print(f"{name}: no chunks produced")
            continue
        preview = chunks[0][:200].replace("\n", " ") + ("..." if len(chunks[0]) > 200 else "")
        print(f"{name} #1 →", preview)


if __name__ == "__main__":
    run_comparison("sample.pdf")