def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split text into chunks of whole sentences, each at most max_tokens tokens.

    Note: a single sentence longer than max_tokens still becomes its own chunk.
    """
    # Requires the NLTK "punkt" sentence model (nltk.download("punkt")).
    from nltk.tokenize import sent_tokenize

    if tokenizer is None:
        # Default to the tokenizer of the summarization model this helper feeds.
        from transformers import T5Tokenizer
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    current_token_count = 0

    for sentence in sentences:
        token_count = len(tokenizer.tokenize(sentence))
        if current_token_count + token_count > max_tokens:
            # Adding this sentence would exceed the limit: flush the chunk
            # built so far and start a new one with this sentence.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
        else:
            # Sentence still fits: append it to the chunk being built.
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
            current_token_count += token_count

    # Flush whatever remains as the final chunk.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
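

# --- Usage sketch (illustrative, not part of the original file) ---
# Uses the default tokenizer loaded above; the sample text and the
# max_tokens value are assumptions for demonstration only.
if __name__ == "__main__":
    sample_text = (
        "This Agreement is made between the parties. "
        "Each party agrees to the terms set forth herein. "
        "Termination requires thirty days written notice."
    )
    for i, chunk in enumerate(chunk_by_token_limit(sample_text, max_tokens=512)):
        print(f"Chunk {i}: {chunk}")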