import logging

from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chunker")


def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into chunks of at most max_tokens tokens, on sentence boundaries."""
    if tokenizer is None:
        # Lazy import: transformers is only needed when no tokenizer is supplied.
        from transformers import T5Tokenizer

        logger.info("Loading default tokenizer: VincentMuriuki/legal-summarizer")
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    logger.info("Starting chunking process...")
    # Requires NLTK's "punkt" sentence model (nltk.download("punkt")).
    sentences = sent_tokenize(text)
    logger.info(f"Total sentences found: {len(sentences)}")

    chunks = []
    current_chunk = ""
    current_token_count = 0

    for idx, sentence in enumerate(sentences):
        token_count = len(tokenizer.tokenize(sentence))
        logger.debug(f"Sentence {idx + 1}: {token_count} tokens")

        if current_token_count + token_count > max_tokens:
            # Close out the current chunk and start a new one with this sentence.
            # Note: a single sentence longer than max_tokens still becomes its
            # own (oversized) chunk.
            if current_chunk:
                logger.info(f"Chunk complete with {current_token_count} tokens")
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
            logger.info(f"Starting new chunk with sentence {idx + 1}")
        else:
            # Append the sentence to the current chunk and keep a running token total.
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
            current_token_count += token_count

    # Flush whatever remains after the last sentence.
    if current_chunk:
        logger.info(f"Final chunk complete with {current_token_count} tokens")
        chunks.append(current_chunk.strip())

    logger.info(f"Total chunks created: {len(chunks)}")
    return chunks
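

# Example usage (illustrative sketch, not part of the original module): the
# sample contract text and the 64-token limit below are made-up values chosen
# only to demonstrate the call; any tokenizer with a .tokenize() method works.
if __name__ == "__main__":
    sample = (
        "This Agreement is entered into by the parties on the effective date. "
        "Each party shall keep the other party's confidential information secret. "
        "Termination requires thirty days' written notice by either party."
    )
    for i, chunk in enumerate(chunk_by_token_limit(sample, max_tokens=64), start=1):
        print(f"Chunk {i}: {chunk}")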