File size: 1,671 Bytes
fcc0ada
c44d8fc
fcc0ada
c44d8fc
 
 
 
fcc0ada
 
c44d8fc
fcc0ada
 
c44d8fc
fcc0ada
c44d8fc
 
fcc0ada
 
 
 
c44d8fc
fcc0ada
c44d8fc
 
fcc0ada
 
c44d8fc
fcc0ada
 
 
c44d8fc
fcc0ada
 
 
 
 
 
 
 
c44d8fc
fcc0ada
 
c44d8fc
fcc0ada
 
c44d8fc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def _split_oversized_sentence(sentence, max_tokens, count_tokens):
    """Break one over-long sentence into word-aligned pieces within the limit.

    Returns a list of ``(piece_text, piece_token_count)`` tuples. Words are
    packed greedily; a single word that alone exceeds ``max_tokens`` is
    emitted as its own piece because it cannot be divided on whitespace.
    Piece token counts are the sum of per-word counts, which is an estimate
    of the count the tokenizer would give for the joined piece.
    """
    pieces = []
    words = []
    used = 0
    for word in sentence.split():
        cost = count_tokens(word)
        if words and used + cost > max_tokens:
            pieces.append((" ".join(words), used))
            words, used = [], 0
        words.append(word)
        used += cost
    if words:
        pieces.append((" ".join(words), used))
    return pieces


def chunk_by_token_limit(text, max_tokens, tokenizer=None):
    """Split ``text`` into sentence-aligned chunks of at most ``max_tokens`` tokens.

    Sentences (as found by NLTK's ``sent_tokenize``) are packed greedily, in
    order, into chunks; consecutive sentences in a chunk are joined with a
    single space. A sentence that is longer than ``max_tokens`` on its own is
    split on whitespace so the limit is still respected where possible.

    Args:
        text: The text to split. Empty, blank, or ``None`` yields ``[]``
            without importing NLTK or loading any tokenizer.
        max_tokens: Maximum number of tokens allowed per chunk.
        tokenizer: Any object with a ``tokenize(str)`` method returning a
            sequence of tokens. When ``None``, the ``T5Tokenizer`` for
            ``VincentMuriuki/legal-summarizer`` is loaded via ``transformers``.

    Returns:
        A list of chunk strings in document order.

    Note:
        Chunk sizes are computed by summing per-sentence token counts, so the
        count for the joined chunk may differ slightly from that sum.
    """
    import logging

    # basicConfig only has an effect if the root logger has no handlers yet,
    # so repeated calls are harmless; kept so log output stays visible.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("chunker")

    # Bail out before the expensive imports / tokenizer download when there
    # is nothing to chunk.
    if not text or not text.strip():
        logger.info("📦 Total chunks created: %d", 0)
        return []

    from nltk.tokenize import sent_tokenize

    if tokenizer is None:
        from transformers import T5Tokenizer
        logger.info("🔄 Loading default tokenizer: VincentMuriuki/legal-summarizer")
        tokenizer = T5Tokenizer.from_pretrained("VincentMuriuki/legal-summarizer")

    def count_tokens(fragment):
        return len(tokenizer.tokenize(fragment))

    logger.info("🧠 Starting chunking process...")
    sentences = sent_tokenize(text)
    logger.info("📄 Total sentences found: %d", len(sentences))

    chunks = []
    current_parts = []  # sentences/pieces of the chunk under construction
    current_token_count = 0

    for number, sentence in enumerate(sentences, start=1):
        token_count = count_tokens(sentence)
        logger.debug("🔍 Sentence %d: %d tokens", number, token_count)

        if token_count > max_tokens:
            # A lone sentence would overflow the limit; fall back to
            # word-level packing for this sentence only.
            logger.warning(
                "Sentence %d has %d tokens (limit %d); splitting on word boundaries",
                number,
                token_count,
                max_tokens,
            )
            units = _split_oversized_sentence(sentence, max_tokens, count_tokens)
        else:
            units = [(sentence, token_count)]

        for unit_text, unit_tokens in units:
            if current_token_count + unit_tokens > max_tokens:
                if current_parts:
                    logger.info("✂️ Chunk complete with %d tokens", current_token_count)
                    chunks.append(" ".join(current_parts).strip())
                current_parts = [unit_text]
                current_token_count = unit_tokens
                logger.info("🚧 Starting new chunk with sentence %d", number)
            else:
                current_parts.append(unit_text)
                current_token_count += unit_tokens

    if current_parts:
        logger.info("✅ Final chunk complete with %d tokens", current_token_count)
        chunks.append(" ".join(current_parts).strip())

    logger.info("📦 Total chunks created: %d", len(chunks))
    return chunks