File size: 544 Bytes
dd58f3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
"""Chunking token‑based."""
from typing import List
import tiktoken
from .config import CHUNK_SIZE, CHUNK_OVERLAP
# Module-level encoder, created once at import time and reused by every
# chunk_text() call; "cl100k_base" is the tiktoken encoding name requested.
_tok = tiktoken.get_encoding("cl100k_base")
def chunk_text(text: str,
               max_tokens: int = CHUNK_SIZE,
               overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into overlapping chunks measured in tokens.

    The text is encoded with the module-level tokenizer, cut into windows
    of at most ``max_tokens`` tokens, and each window is decoded back to a
    string. Consecutive windows start ``max_tokens - overlap`` tokens
    apart, so neighbouring chunks share ``overlap`` tokens. Windows are
    produced until the start position passes the end of the token list,
    so the last chunk may be shorter than ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared between consecutive chunks; must
            be smaller than ``max_tokens``.

    Returns:
        The decoded chunks in order. Empty when *text* encodes to no
        tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or if ``overlap``
            is greater than or equal to ``max_tokens``.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    # A non-positive stride would never advance the window, which used to
    # make this function loop forever; reject it up front instead.
    if overlap >= max_tokens:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than "
            f"max_tokens ({max_tokens})"
        )

    tokens = _tok.encode(text)
    stride = max_tokens - overlap
    # Slicing clamps at the end of the list, so the last window is simply
    # shorter when fewer than max_tokens tokens remain.
    return [
        _tok.decode(tokens[begin:begin + max_tokens])
        for begin in range(0, len(tokens), stride)
    ]
|