# nyaynetra-summarizer / chunker.py
import logging

from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunker")


def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split ``text`` into chunks of at most ``max_tokens`` tokens.

    Sentences are kept intact: a chunk is closed as soon as adding the next
    sentence would exceed the limit. A single sentence longer than
    ``max_tokens`` becomes its own (oversized) chunk. Token counts are summed
    per sentence, so the total can differ slightly from tokenizing the joined
    chunk in one pass.
    """
    if tokenizer is None:
        # Lazy import so transformers (and sentencepiece, which T5Tokenizer
        # needs) are only required when no tokenizer is supplied.
        from transformers import T5Tokenizer
        logger.info("🔄 Loading default tokenizer: VincentMuriuki/legal-summarizer")
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    logger.info("🧠 Starting chunking process...")
    # sent_tokenize needs the NLTK "punkt" sentence model to be installed.
    sentences = sent_tokenize(text)
    logger.info(f"📄 Total sentences found: {len(sentences)}")

    chunks = []
    current_chunk = ""
    current_token_count = 0

    for idx, sentence in enumerate(sentences):
        token_count = len(tokenizer.tokenize(sentence))
        logger.debug(f"🔍 Sentence {idx + 1}: {token_count} tokens")

        if current_token_count + token_count > max_tokens:
            # Close out the current chunk, then start a new one at this sentence.
            if current_chunk:
                logger.info(f"✂️ Chunk complete with {current_token_count} tokens")
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
            logger.info(f"🚧 Starting new chunk with sentence {idx + 1}")
        else:
            # Still under the limit: append the sentence to the current chunk.
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
            current_token_count += token_count

    # Flush whatever remains after the last sentence.
    if current_chunk:
        logger.info(f"✅ Final chunk complete with {current_token_count} tokens")
        chunks.append(current_chunk.strip())

    logger.info(f"📦 Total chunks created: {len(chunks)}")
    return chunks
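

# Minimal usage sketch. The sample text and max_tokens value are illustrative
# only; the first run downloads the default tokenizer weights and, if missing,
# the NLTK punkt data (newer NLTK releases may want "punkt_tab" instead).
# For repeated calls, load a tokenizer once and pass it in to avoid reloading.
if __name__ == "__main__":
    import nltk

    nltk.download("punkt", quiet=True)  # sentence-splitter data for sent_tokenize

    sample = (
        "The appellant filed a petition before the High Court. "
        "The respondent contested its maintainability. "
        "After hearing both parties, the court reserved judgment."
    )
    for i, chunk in enumerate(chunk_by_token_limit(sample, max_tokens=20), start=1):
        print(f"Chunk {i}: {chunk}")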