"""Chunking token‑based.""" | |
from typing import List

import tiktoken

from .config import CHUNK_SIZE, CHUNK_OVERLAP
# Shared tokenizer for this module, created once at import time so that
# chunk_text() does not look up the "cl100k_base" encoding on every call.
_tok = tiktoken.get_encoding("cl100k_base")
def chunk_text(text: str,
               max_tokens: int = CHUNK_SIZE,
               overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into overlapping chunks measured in tokens.

    The text is encoded with the module-level tokenizer, cut into windows
    of at most ``max_tokens`` tokens, and each window is decoded back to a
    string. Consecutive windows start ``max_tokens - overlap`` tokens
    apart, so neighbouring chunks share ``overlap`` tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must be
            smaller than ``max_tokens``. A negative value is not rejected:
            it makes the stride larger than the window, so the tokens
            between two windows are skipped.

    Returns:
        The decoded chunks in document order. An empty list when *text*
        encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or if ``overlap``
            is greater than or equal to ``max_tokens``.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if overlap >= max_tokens:
        # A stride of zero or less would never advance through the tokens
        # and the loop below would not terminate.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than "
            f"max_tokens ({max_tokens})"
        )

    tokens = _tok.encode(text)
    total = len(tokens)
    step = max_tokens - overlap

    chunks: List[str] = []
    for start in range(0, total, step):
        end = min(start + max_tokens, total)
        chunks.append(_tok.decode(tokens[start:end]))
        if end == total:
            # This window already reaches the last token. Any later
            # window would only be a suffix of it (pure overlap), so stop
            # instead of emitting a redundant trailing chunk.
            break
    return chunks