def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into sentence-aligned chunks whose token counts stay under max_tokens."""
    from nltk.tokenize import sent_tokenize

    # Fall back to the summarizer's own tokenizer if none is supplied.
    if tokenizer is None:
        from transformers import T5Tokenizer
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    current_token_count = 0

    for sentence in sentences:
        token_count = len(tokenizer.tokenize(sentence))
        if current_token_count + token_count > max_tokens:
            # Adding this sentence would exceed the limit: close the current
            # chunk and start a new one with this sentence.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
        else:
            # Otherwise, append the sentence to the current chunk.
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
            current_token_count += token_count

    # Flush the final chunk.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
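
A minimal usage sketch follows. It assumes nltk, transformers, and sentencepiece are installed and that NLTK's "punkt" sentence-tokenizer data has been downloaded; the sample_text string and the max_tokens value of 32 are illustrative only and not part of the original code.

# Minimal usage sketch (assumptions: nltk, transformers, and sentencepiece
# are installed; NLTK "punkt" data is available; sample_text is made up).
import nltk
nltk.download("punkt", quiet=True)

sample_text = (
    "This Agreement is entered into by and between the parties. "
    "Each party shall perform its obligations in good faith. "
    "Termination requires thirty days written notice."
)

chunks = chunk_by_token_limit(sample_text, max_tokens=32)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}: {chunk}")

Because the split happens only at sentence boundaries, a single sentence longer than max_tokens will still produce an oversized chunk; callers that need a hard cap would have to truncate or further split such sentences.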