sagar008 committed
Commit fcc0ada · verified · 1 Parent(s): 17d6cf2

Create chunker.py


Added chunker.py to split raw preprocessed text into small, token-limited chunks.

Files changed (1)
  1. chunker.py +33 -0
chunker.py ADDED
@@ -0,0 +1,33 @@
+ def chunk_by_token_limit(text, max_tokens, tokenizer=None):
+     """Greedily pack sentences into chunks of at most max_tokens tokens."""
+     # sent_tokenize requires the NLTK "punkt" data: nltk.download("punkt")
+     from nltk.tokenize import sent_tokenize
+     if tokenizer is None:
+         from transformers import T5Tokenizer
+         tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")
+
+     sentences = sent_tokenize(text)
+     chunks = []
+     current_chunk = ""
+     current_token_count = 0
+
+     for sentence in sentences:
+         token_count = len(tokenizer.tokenize(sentence))
+         # Start a new chunk once the next sentence would exceed the limit.
+         if current_token_count + token_count > max_tokens:
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence
+             current_token_count = token_count
+         else:
+             if current_chunk:
+                 current_chunk += " " + sentence
+             else:
+                 current_chunk = sentence
+             current_token_count += token_count
+
+     # Flush the last, partially filled chunk.
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+
+     return chunks
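A minimal usage sketch for the new function (the 512-token limit and the sample text are illustrative, not from the commit; it assumes nltk and transformers are installed and the punkt sentence-tokenizer data has been downloaded once):

```python
import nltk

nltk.download("punkt")  # one-time download of the sentence-tokenizer data

from chunker import chunk_by_token_limit

raw_text = "First sentence of a long document. Second sentence. Third sentence."
chunks = chunk_by_token_limit(raw_text, max_tokens=512)
print(len(chunks), "chunk(s)")
```

Note that the packing is per sentence: a single sentence longer than max_tokens still becomes its own chunk and can exceed the limit, so callers should pick max_tokens with some headroom below the model's input size.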