import logging

from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunker")


def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into chunks whose token counts stay at or below max_tokens.

    Sentences are never split: each chunk is built by accumulating whole
    sentences until adding the next one would exceed the limit.
    """
    if tokenizer is None:
        # Lazy import so transformers is only needed when no tokenizer is supplied.
        from transformers import T5Tokenizer
        logger.info("🔄 Loading default tokenizer: VincentMuriuki/legal-summarizer")
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    logger.info("🧠 Starting chunking process...")
    # Requires the NLTK "punkt" data ("punkt_tab" on newer NLTK releases).
    sentences = sent_tokenize(text)
    logger.info(f"📄 Total sentences found: {len(sentences)}")

    chunks = []
    current_chunk = ""
    current_token_count = 0

    for idx, sentence in enumerate(sentences):
        token_count = len(tokenizer.tokenize(sentence))
        logger.debug(f"🔍 Sentence {idx + 1}: {token_count} tokens")

        if current_token_count + token_count > max_tokens:
            # Adding this sentence would overflow the limit: close the current
            # chunk (if any) and start a new one with this sentence.
            if current_chunk:
                logger.info(f"✂️ Chunk complete with {current_token_count} tokens")
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
            logger.info(f"🚧 Starting new chunk with sentence {idx + 1}")
        else:
            # The sentence still fits: append it to the current chunk.
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
            current_token_count += token_count

    # Flush whatever remains after the last sentence.
    if current_chunk:
        logger.info(f"✅ Final chunk complete with {current_token_count} tokens")
        chunks.append(current_chunk.strip())

    logger.info(f"📦 Total chunks created: {len(chunks)}")
    return chunks
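
# --- Usage sketch (illustrative, not part of the original function) ---
# A minimal example of calling chunk_by_token_limit on a short passage, assuming
# the nltk, transformers, and sentencepiece packages are installed. The sample
# text and the max_tokens value of 512 are assumptions for demonstration only;
# 512 is chosen to fit a typical T5 encoder window.
if __name__ == "__main__":
    import nltk

    # Fetch the sentence-tokenizer data on first run (no-op if already present).
    # Newer NLTK releases may require nltk.download("punkt_tab") instead.
    nltk.download("punkt", quiet=True)

    sample_text = (
        "This Agreement is entered into by and between the parties. "
        "Each party agrees to the terms set forth herein. "
        "Termination requires thirty days written notice."
    )

    chunks = chunk_by_token_limit(sample_text, max_tokens=512)
    for i, chunk in enumerate(chunks, start=1):
        print(f"Chunk {i}: {chunk[:80]}")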