File size: 544 Bytes
dd58f3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
"""Chunking token‑based."""
from typing import List
import tiktoken
from .config import CHUNK_SIZE, CHUNK_OVERLAP

# Module-level tokenizer for the "cl100k_base" encoding, created once at
# import time and used by chunk_text for both encoding and decoding.
_tok = tiktoken.get_encoding("cl100k_base")

def chunk_text(text: str,
               max_tokens: int = CHUNK_SIZE,
               overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split *text* into overlapping chunks measured in tokens.

    The text is encoded with the module-level tokenizer, cut into windows of
    at most ``max_tokens`` tokens, and each window is decoded back to a
    string. Consecutive windows share ``overlap`` tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        The decoded chunks in document order. The list is empty when *text*
        encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_tokens``.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    # overlap >= max_tokens would make the stride zero or negative, so the
    # window would never advance; a negative overlap would skip tokens.
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )

    tokens = _tok.encode(text)
    total = len(tokens)
    step = max_tokens - overlap

    chunks: List[str] = []
    for start in range(0, total, step):
        end = min(start + max_tokens, total)
        chunks.append(_tok.decode(tokens[start:end]))
        # Once a window reaches the last token, any further window would be
        # a strict suffix of this one and add no new content.
        if end == total:
            break
    return chunks