Spaces:

Merlintxu
/

Chunkings

Sleeping

App Files Files Community

Chunkings / src /chunker.py

Marcos Morales

modified: README.md

dd58f3d 24 days ago

history blame contribute delete

544 Bytes

	"""Chunking token‑based."""
	from typing import List
	import tiktoken
	from .config import CHUNK_SIZE, CHUNK_OVERLAP

	# Module-level tokenizer, built once at import time so that every
	# chunk_text() call reuses the same encoding object.
	_tok = tiktoken.get_encoding("cl100k_base")

	def chunk_text(text: str,
	               max_tokens: int = CHUNK_SIZE,
	               overlap: int = CHUNK_OVERLAP) -> List[str]:
	    """Split *text* into overlapping, token-bounded chunks.

	    The text is encoded with the module-level tokenizer, cut into
	    windows of at most ``max_tokens`` tokens, and each window is decoded
	    back to a string. Consecutive windows start ``max_tokens - overlap``
	    tokens apart, so they share ``overlap`` tokens.

	    Args:
	        text: The text to split.
	        max_tokens: Maximum number of tokens per chunk; must be positive.
	        overlap: Number of tokens shared by consecutive chunks; must be
	            smaller than ``max_tokens``.

	    Returns:
	        The decoded chunks in document order. Empty when *text* encodes
	        to no tokens.

	    Raises:
	        ValueError: If ``max_tokens`` is not positive or ``overlap`` is
	            not smaller than ``max_tokens``.
	    """
	    if max_tokens <= 0:
	        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
	    if overlap >= max_tokens:
	        # A non-positive step would never advance the window, so the
	        # loop below would never terminate.
	        raise ValueError(
	            f"overlap ({overlap}) must be smaller than "
	            f"max_tokens ({max_tokens})"
	        )

	    tokens = _tok.encode(text)
	    total = len(tokens)
	    step = max_tokens - overlap

	    chunks: List[str] = []
	    for start in range(0, total, step):
	        end = min(start + max_tokens, total)
	        chunks.append(_tok.decode(tokens[start:end]))
	        if end == total:
	            # This window already reaches the last token; any further
	            # window would only repeat a suffix of it.
	            break
	    return chunks