File size: 688 Bytes
ef4c8c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import re

def segment_paragraphs(text: str, *, max_len: int = 1000) -> list:
    """Split *text* into a list of non-empty, whitespace-stripped paragraphs.

    A paragraph break is one or more blank lines. A line counts as blank
    when it is empty or contains only spaces/tabs, and both ``\\n`` and
    ``\\r\\n`` line endings are recognised. Single line breaks inside a
    paragraph are preserved.

    Any paragraph longer than *max_len* characters is further cut into
    consecutive fixed-width pieces of at most *max_len* characters. The
    cut is positional (it may fall inside a word); each piece is stripped
    and whitespace-only pieces are discarded.

    Args:
        text: The text to segment. Non-string input yields an empty list.
        max_len: Maximum length of a returned paragraph. Defaults to 1000.

    Returns:
        A list of paragraph strings in document order, each non-empty and
        no longer than *max_len*.

    Raises:
        ValueError: If *max_len* is not a positive integer.
    """
    if not isinstance(max_len, int) or max_len < 1:
        raise ValueError("max_len must be a positive integer")
    if not isinstance(text, str):
        return []

    paragraphs = []
    # A line break followed by one or more (optionally space/tab-only)
    # blank lines. "\r?\n" keeps a lone CRLF from being read as two breaks.
    for block in re.split(r"\r?\n(?:[ \t]*\r?\n)+", text):
        block = block.strip()
        if len(block) <= max_len:
            if block:
                paragraphs.append(block)
            continue
        # Over-long paragraph: fall back to fixed-width slicing.
        for start in range(0, len(block), max_len):
            piece = block[start:start + max_len].strip()
            if piece:
                paragraphs.append(piece)
    return paragraphs