import logging

from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunker")


def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into chunks of at most max_tokens tokens, on sentence boundaries."""
    if tokenizer is None:
        # Lazy import: transformers is only needed when no tokenizer is supplied.
        from transformers import T5Tokenizer

        logger.info("Loading default tokenizer: VincentMuriuki/legal-summarizer")
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    logger.info("Starting chunking process...")
    # Requires NLTK's "punkt" sentence model (nltk.download("punkt")).
    sentences = sent_tokenize(text)
    logger.info(f"Total sentences found: {len(sentences)}")

    chunks = []
    current_chunk = ""
    current_token_count = 0

    for idx, sentence in enumerate(sentences):
        token_count = len(tokenizer.tokenize(sentence))
        logger.debug(f"Sentence {idx + 1}: {token_count} tokens")

        if current_token_count + token_count > max_tokens:
            # Close out the current chunk and start a new one with this sentence.
            # Note: a single sentence longer than max_tokens still becomes its
            # own (oversized) chunk.
            if current_chunk:
                logger.info(f"Chunk complete with {current_token_count} tokens")
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
            logger.info(f"Starting new chunk with sentence {idx + 1}")
        else:
            # Append the sentence to the current chunk and keep a running token total.
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
            current_token_count += token_count

    # Flush whatever remains after the last sentence.
    if current_chunk:
        logger.info(f"Final chunk complete with {current_token_count} tokens")
        chunks.append(current_chunk.strip())

    logger.info(f"Total chunks created: {len(chunks)}")
    return chunks
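

# Example usage (illustrative sketch, not part of the original module): the
# sample contract text and the 64-token limit below are made-up values chosen
# only to demonstrate the call; any tokenizer with a .tokenize() method works.
if __name__ == "__main__":
    sample = (
        "This Agreement is entered into by the parties on the effective date. "
        "Each party shall keep the other party's confidential information secret. "
        "Termination requires thirty days' written notice by either party."
    )
    for i, chunk in enumerate(chunk_by_token_limit(sample, max_tokens=64), start=1):
        print(f"Chunk {i}: {chunk}")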