John-Jiang's picture
init commit
5301c48
from typing import List
import re
from starfish.data_ingest.splitter.base_splitter import TextSplitter
class TextChunkSplitter(TextSplitter):
    """Split text into paragraph-aligned chunks with optional sentence overlap.

    Paragraphs (separated by blank lines) are packed greedily into chunks.
    When the next paragraph would push the current chunk past ``chunk_size``,
    the chunk is emitted and the next chunk is seeded with the trailing
    sentences of the emitted one (up to ``overlap`` characters).

    All sizes are measured in characters (``len`` of the string).

    Note:
        * A chunk may exceed ``chunk_size``: the size check ignores the
          paragraph separator, a single oversized paragraph is never split,
          and a chunk shorter than ``min_chunk_size`` keeps absorbing
          paragraphs regardless of ``chunk_size``.
        * A trailing chunk shorter than ``min_chunk_size`` is dropped, so a
          very short input yields an empty list.
    """

    # Two or more consecutive newlines mark a paragraph break.
    _PARAGRAPH_BREAK = re.compile(r"\n{2,}")
    # Whitespace that directly follows sentence-ending punctuation.
    _SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")
    # Separator used when paragraphs are joined back together inside a chunk.
    _PARAGRAPH_SEPARATOR = "\n\n"

    def __init__(
        self,
        chunk_size: int = 400,
        overlap: int = 20,
        min_chunk_size: int = 100,
    ) -> None:
        """Create a new TextChunkSplitter.

        Args:
            chunk_size: Target maximum size of each chunk, in characters.
            overlap: Maximum number of characters of trailing sentences
                carried over from one chunk into the next.
            min_chunk_size: Minimum acceptable chunk size; smaller chunks keep
                growing, and a smaller final chunk is discarded.
        """
        self._chunk_size = chunk_size
        self._overlap = overlap
        self._min_chunk_size = min_chunk_size

    def split_text(self, text: str) -> List[str]:
        """Split text into chunks with optional overlap.

        Args:
            text: Input text to split.

        Returns:
            List of text chunks, in document order. Paragraphs inside a chunk
            are joined with a blank line. The list is empty when the input is
            blank or shorter than ``min_chunk_size``.
        """
        separator = self._PARAGRAPH_SEPARATOR
        # Splitting on runs of 2+ newlines collapses any amount of vertical
        # whitespace into a single paragraph boundary.
        paragraphs = self._PARAGRAPH_BREAK.split(text.strip())

        chunks: List[str] = []
        current = ""

        for para in paragraphs:
            # Skip paragraphs that contain only whitespace.
            if not para.strip():
                continue

            if not current:
                current = para
                continue

            fits = len(current) + len(para) <= self._chunk_size
            too_small_to_emit = len(current) < self._min_chunk_size
            if fits or too_small_to_emit:
                # Either there is still room, or the chunk is below the
                # minimum size and must keep growing before it can be emitted.
                current = current + separator + para
                continue

            chunks.append(current)
            carry = self._overlap_tail(current)
            # Only prepend the separator when there is overlap text to
            # separate from; otherwise the new chunk would begin with a
            # stray blank line.
            current = carry + separator + para if carry else para

        # Emit the remainder only if it meets the minimum size.
        if current and len(current) >= self._min_chunk_size:
            chunks.append(current)

        return chunks

    def _overlap_tail(self, chunk: str) -> str:
        """Return the trailing whole sentences of ``chunk`` within the overlap budget.

        Sentences are taken from the end of ``chunk`` and joined with single
        spaces until adding another would exceed ``overlap`` characters.
        Returns an empty string when not even the last sentence fits.
        """
        sentences = [s for s in self._SENTENCE_BOUNDARY.split(chunk) if s]
        tail = ""
        for sentence in reversed(sentences):
            # The joining space already in ``tail`` counts toward the budget.
            if len(tail) + len(sentence) > self._overlap:
                break
            tail = sentence + " " + tail
        return tail.strip()