sagar008 committed
Commit c44d8fc · verified · 1 Parent(s): c07963e

Update chunker.py


Adding logs for debugging

Files changed (1)
chunker.py +17 -1
chunker.py CHANGED
@@ -1,21 +1,34 @@
 def chunk_by_token_limit(text, max_tokens, tokenizer=None):
+    import logging
     from nltk.tokenize import sent_tokenize
+
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger("chunker")
+
     if tokenizer is None:
         from transformers import T5Tokenizer
+        logger.info("🔄 Loading default tokenizer: VincentMuriuki/legal-summarizer")
         tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")
 
+    logger.info("🧠 Starting chunking process...")
     sentences = sent_tokenize(text)
+    logger.info(f"📄 Total sentences found: {len(sentences)}")
+
     chunks = []
     current_chunk = ""
     current_token_count = 0
 
-    for sentence in sentences:
+    for idx, sentence in enumerate(sentences):
         token_count = len(tokenizer.tokenize(sentence))
+        logger.debug(f"🔍 Sentence {idx + 1}: {token_count} tokens")
+
         if current_token_count + token_count > max_tokens:
             if current_chunk:
+                logger.info(f"✂️ Chunk complete with {current_token_count} tokens")
                 chunks.append(current_chunk.strip())
             current_chunk = sentence
             current_token_count = token_count
+            logger.info(f"🚧 Starting new chunk with sentence {idx + 1}")
         else:
             if current_chunk:
                 current_chunk += " " + sentence
@@ -24,7 +37,10 @@ def chunk_by_token_limit(text, max_tokens, tokenizer=None):
         current_token_count += token_count
 
     if current_chunk:
+        logger.info(f"✅ Final chunk complete with {current_token_count} tokens")
         chunks.append(current_chunk.strip())
 
+    logger.info(f"📦 Total chunks created: {len(chunks)}")
     return chunks
 
+
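For reference, a minimal usage sketch of the updated function. This is a hypothetical caller, not part of the commit: it assumes chunker.py is importable from the working directory, that nltk, transformers, and sentencepiece are installed, and the sample text and the 512-token limit are illustrative values.

import logging
import nltk

from chunker import chunk_by_token_limit  # assumes chunker.py is on the path

nltk.download("punkt")  # sentence models for sent_tokenize (newer NLTK may need "punkt_tab")

# Lower the "chunker" logger to DEBUG to also see the per-sentence token
# counts; the INFO progress logs appear without this, because the function
# calls logging.basicConfig itself.
logging.getLogger("chunker").setLevel(logging.DEBUG)

sample_text = "First sentence. Second sentence. Third sentence."  # illustrative input
chunks = chunk_by_token_limit(sample_text, max_tokens=512)

for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk {i}: {chunk}")

With input this far under max_tokens, everything lands in a single chunk, so a run would log roughly "🧠 Starting chunking process...", "📄 Total sentences found: 3", "✅ Final chunk complete with N tokens", and "📦 Total chunks created: 1" (the exact token counts depend on the tokenizer).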