Chunkings / src /chunker.py
Marcos Morales
modified: README.md
dd58f3d
raw
history blame contribute delete
544 Bytes
"""Token-based text chunking."""
from typing import List
import tiktoken
from .config import CHUNK_SIZE, CHUNK_OVERLAP
# Shared tokenizer, created once at import time and reused by chunk_text for
# both encoding text to tokens and decoding token slices back to text.
_tok = tiktoken.get_encoding("cl100k_base")
def chunk_text(text: str,
               max_tokens: int = CHUNK_SIZE,
               overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into overlapping chunks measured in tokens.

    The text is tokenized with the module-level tokenizer, cut into windows
    of at most ``max_tokens`` tokens, and each window is decoded back to a
    string. Consecutive windows start ``max_tokens - overlap`` tokens apart,
    so neighbouring chunks share ``overlap`` tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Number of tokens shared between consecutive chunks; must be
            smaller than ``max_tokens``.

    Returns:
        The decoded chunks in document order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``overlap >= max_tokens``. The window would then never
            advance, and the loop would not terminate.
    """
    step = max_tokens - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_tokens "
            f"({max_tokens})"
        )

    tokens = _tok.encode(text)
    total = len(tokens)
    chunks: List[str] = []
    for start in range(0, total, step):
        end = min(start + max_tokens, total)
        chunks.append(_tok.decode(tokens[start:end]))
        if end == total:
            # This window already reached the last token; any further window
            # would only repeat tokens contained in this chunk.
            break
    return chunks