def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into chunks of whole sentences, each holding at most max_tokens tokens."""
    # sent_tokenize needs NLTK's "punkt" sentence models (nltk.download("punkt")).
    from nltk.tokenize import sent_tokenize

    # Default to the summarizer's own tokenizer so chunk sizes match the model's limits.
    if tokenizer is None:
        from transformers import T5Tokenizer
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    current_token_count = 0

    for sentence in sentences:
        token_count = len(tokenizer.tokenize(sentence))

        if current_token_count + token_count > max_tokens:
            # Adding this sentence would exceed the limit: close the current
            # chunk and start a new one with this sentence.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
        else:
            # The sentence still fits: append it to the running chunk.
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
            current_token_count += token_count

    # Flush the final, partially filled chunk.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
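
# A minimal usage sketch of the function above. The sample text, the 512-token
# limit, and the printed preview are illustrative assumptions, not part of the
# original code; it also assumes the NLTK "punkt" data and the Hugging Face
# model are available locally.
if __name__ == "__main__":
    sample_text = (
        "This Agreement is made between the parties as of the effective date. "
        "Each party shall keep the other party's information confidential. "
        "Either party may terminate this Agreement with thirty days' written notice."
    )

    chunks = chunk_by_token_limit(sample_text, max_tokens=512)

    # Print a short preview of each chunk to confirm sentences stay intact.
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: {chunk[:60]}...")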