Update chunker.py
adding logs for debug
- chunker.py +17 -1
chunker.py
CHANGED
@@ -1,21 +1,34 @@
 def chunk_by_token_limit(text, max_tokens, tokenizer=None):
+    import logging
     from nltk.tokenize import sent_tokenize
+
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger("chunker")
+
     if tokenizer is None:
         from transformers import T5Tokenizer
+        logger.info("Loading default tokenizer: VincentMuriuki/legal-summarizer")
         tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")
 
+    logger.info("Starting chunking process...")
     sentences = sent_tokenize(text)
+    logger.info(f"Total sentences found: {len(sentences)}")
+
     chunks = []
     current_chunk = ""
     current_token_count = 0
 
-    for sentence in sentences:
+    for idx, sentence in enumerate(sentences):
         token_count = len(tokenizer.tokenize(sentence))
+        logger.debug(f"Sentence {idx + 1}: {token_count} tokens")
+
         if current_token_count + token_count > max_tokens:
             if current_chunk:
+                logger.info(f"Chunk complete with {current_token_count} tokens")
                 chunks.append(current_chunk.strip())
             current_chunk = sentence
             current_token_count = token_count
+            logger.info(f"Starting new chunk with sentence {idx + 1}")
         else:
             if current_chunk:
                 current_chunk += " " + sentence
@@ -24,7 +37,10 @@ def chunk_by_token_limit(text, max_tokens, tokenizer=None):
         current_token_count += token_count
 
     if current_chunk:
+        logger.info(f"Final chunk complete with {current_token_count} tokens")
         chunks.append(current_chunk.strip())
 
+    logger.info(f"Total chunks created: {len(chunks)}")
     return chunks
 
+
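For reference, a minimal usage sketch under assumed context: the module path chunker, the sample text, and the punkt download are assumptions, not part of this commit. The per-sentence counts are logged at DEBUG, so they only appear if the "chunker" logger's level is lowered; the rest shows at the INFO level set by basicConfig.

import logging
import nltk
from chunker import chunk_by_token_limit  # assumed module layout

nltk.download("punkt")  # sent_tokenize needs the punkt sentence models

# Surface the per-sentence logger.debug(...) lines as well as the INFO logs.
logging.getLogger("chunker").setLevel(logging.DEBUG)

text = "First sentence of a long contract. Second sentence. Third one."
chunks = chunk_by_token_limit(text, max_tokens=512)
print(f"{len(chunks)} chunk(s) returned")

One caveat on the design: logging.basicConfig runs inside the function and configures the root logger process-wide the first time it is called, but it is a no-op if the host application has already configured logging, so these logs may be silenced or reformatted by the surrounding app's setup.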