File size: 1,114 Bytes
d82600f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """
    Break a piece of text into chunks via RecursiveCharacterTextSplitter.

    The splitter is configured with the given chunk_size and chunk_overlap
    and measures length with the built-in ``len``.

    Args:
        text: The text to break up
        chunk_size: Forwarded to the splitter as ``chunk_size``
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``

    Returns:
        The list returned by the splitter's ``split_text``. An empty list
        is returned when the text is falsy or only whitespace. If anything
        raises along the way, the error is printed and the text is handed
        back as a one-element list (or an empty list if the text is falsy).
    """
    try:
        # Only text that is truthy and still non-empty after stripping
        # whitespace is worth sending to the splitter.
        has_content = bool(text) and bool(text.strip())
        if has_content:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
            )
            chunks = splitter.split_text(text)
            print(f"[INFO] Created {len(chunks)} chunks from text")
            return chunks

        print("[WARNING] Empty or None text provided for chunking")
        return []
    except Exception as e:
        print(f"[ERROR] Text chunking failed: {e}")
        # Best-effort fallback: hand the input back as one single chunk
        # rather than propagating the failure to the caller.
        fallback = []
        if text:
            fallback.append(text)
        return fallback