nyaynetra-summarizer / chunker.py
def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into chunks whose token counts stay within max_tokens.

    Sentences are kept intact: chunks are built sentence by sentence, and a new
    chunk is started once adding the next sentence would exceed the limit.
    """
    from nltk.tokenize import sent_tokenize

    if tokenizer is None:
        from transformers import T5Tokenizer
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    current_token_count = 0

    for sentence in sentences:
        token_count = len(tokenizer.tokenize(sentence))
        if current_token_count + token_count > max_tokens:
            # Adding this sentence would overflow the limit: close the current
            # chunk and start a new one with this sentence.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
        else:
            # The sentence fits: append it to the current chunk.
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
            current_token_count += token_count

    # Flush the final, partially filled chunk.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
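

# Minimal usage sketch (not part of the original file): the sample text, the
# max_tokens value, and the nltk.download call are illustrative assumptions.
if __name__ == "__main__":
    import nltk

    # sent_tokenize needs NLTK's punkt sentence model; newer NLTK releases may
    # name the resource "punkt_tab" instead.
    nltk.download("punkt")

    sample_text = (
        "The appellant filed the suit in 2015. "
        "The trial court dismissed it for want of jurisdiction. "
        "On appeal, the High Court reversed that finding and remanded the matter."
    )

    # With tokenizer=None, the function loads the legal-summarizer tokenizer
    # from the Hugging Face Hub on first use.
    for i, chunk in enumerate(chunk_by_token_limit(sample_text, max_tokens=32)):
        print(f"Chunk {i}: {chunk}")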