"""
Semantic chunking for intelligent context segmentation.
"""

import logging
import uuid
from typing import List, Dict, Any, Optional

from efficient_context.chunking.base import BaseChunker, Chunk
from efficient_context.utils.text import split_into_sentences

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SemanticChunker(BaseChunker):
    """
    Chunker that creates chunks based on semantic boundaries.
    
    This chunker aims to keep semantically related content together, unlike
    simple token-based chunking that might split content mid-thought.
    """
    
    def __init__(
        self,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        respect_paragraphs: bool = True,
        min_chunk_size: int = 100,
        max_chunk_size: int = 1024
    ):
        """
        Initialize the SemanticChunker.
        
        Args:
            chunk_size: Target size for chunks in tokens (words)
            chunk_overlap: Number of tokens to overlap between chunks
            respect_paragraphs: Whether to avoid breaking paragraphs across chunks
            min_chunk_size: Minimum chunk size in tokens
            max_chunk_size: Maximum chunk size in tokens
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.respect_paragraphs = respect_paragraphs
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size
        
        logger.info(
            "SemanticChunker initialized with target size: %d tokens, overlap: %d tokens",
            chunk_size, chunk_overlap
        )
    
    def _estimate_tokens(self, text: str) -> int:
        """
        Estimate the number of tokens in text.
        
        Args:
            text: Text to estimate tokens for
            
        Returns:
            token_count: Estimated number of tokens
        """
        # Simple whitespace-based token estimation
        # This is much faster than using a tokenizer and good enough for chunking
        return len(text.split())
    
    def _identify_paragraphs(self, content: str) -> List[str]:
        """
        Split content into paragraphs.
        
        Args:
            content: Content to split
            
        Returns:
            paragraphs: List of paragraphs
        """
        # Split on empty lines (common paragraph separator)
        paragraphs = [p.strip() for p in content.split("\n\n")]
        
        # Handle other kinds of paragraph breaks and clean up
        result = []
        current = ""
        
        for p in paragraphs:
            # Skip empty paragraphs
            if not p:
                continue
                
            # Join lines separated by single newlines; a whitespace-only line
            # still ends the current paragraph
            lines = p.split("\n")
            for line in lines:
                if not line.strip():
                    if current:
                        result.append(current)
                        current = ""
                else:
                    if current:
                        current += " " + line.strip()
                    else:
                        current = line.strip()
            
            if current:
                result.append(current)
                current = ""
        
        # Add any remaining content
        if current:
            result.append(current)
        
        return result if result else [content]
    
    def _create_semantic_chunks(
        self,
        paragraphs: List[str],
        document_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List[Chunk]:
        """
        Create chunks from paragraphs respecting semantic boundaries.
        
        Args:
            paragraphs: List of paragraphs to chunk
            document_id: Optional ID of the source document
            metadata: Optional metadata for the chunks
            
        Returns:
            chunks: List of Chunk objects
        """
        chunks = []
        current_chunk_text = ""
        current_token_count = 0
        
        for paragraph in paragraphs:
            paragraph_tokens = self._estimate_tokens(paragraph)
            
            # Check if adding this paragraph would exceed the max chunk size
            if (current_token_count + paragraph_tokens > self.max_chunk_size and 
                current_token_count >= self.min_chunk_size):
                # Create a new chunk with the current content
                chunk_id = str(uuid.uuid4())
                chunk = Chunk(
                    content=current_chunk_text.strip(),
                    chunk_id=chunk_id,
                    document_id=document_id,
                    metadata=metadata
                )
                chunks.append(chunk)
                
                # Start a new chunk with overlap
                if self.chunk_overlap > 0 and current_chunk_text:
                    # Get the last N tokens for overlap
                    words = current_chunk_text.split()
                    overlap_text = " ".join(words[-min(self.chunk_overlap, len(words)):])
                    current_chunk_text = overlap_text + " " + paragraph
                    current_token_count = self._estimate_tokens(current_chunk_text)
                else:
                    # No overlap
                    current_chunk_text = paragraph
                    current_token_count = paragraph_tokens
            # Handle very large paragraphs that exceed max_chunk_size on their own
            elif paragraph_tokens > self.max_chunk_size:
                # If we have existing content, create a chunk first
                if current_chunk_text:
                    chunk_id = str(uuid.uuid4())
                    chunk = Chunk(
                        content=current_chunk_text.strip(),
                        chunk_id=chunk_id,
                        document_id=document_id,
                        metadata=metadata
                    )
                    chunks.append(chunk)
                    current_chunk_text = ""
                    current_token_count = 0
                
                # Split the large paragraph into sentences
                sentences = split_into_sentences(paragraph)
                sentence_chunk = ""
                sentence_token_count = 0
                
                for sentence in sentences:
                    sentence_tokens = self._estimate_tokens(sentence)
                    
                    # Check if adding this sentence would exceed the max chunk size
                    if (sentence_token_count + sentence_tokens > self.max_chunk_size and 
                        sentence_token_count >= self.min_chunk_size):
                        # Create a new chunk with the current sentences
                        chunk_id = str(uuid.uuid4())
                        chunk = Chunk(
                            content=sentence_chunk.strip(),
                            chunk_id=chunk_id,
                            document_id=document_id,
                            metadata=metadata
                        )
                        chunks.append(chunk)
                        
                        # Start a new chunk with overlap
                        if self.chunk_overlap > 0 and sentence_chunk:
                            words = sentence_chunk.split()
                            overlap_text = " ".join(words[-min(self.chunk_overlap, len(words)):])
                            sentence_chunk = overlap_text + " " + sentence
                            sentence_token_count = self._estimate_tokens(sentence_chunk)
                        else:
                            sentence_chunk = sentence
                            sentence_token_count = sentence_tokens
                    else:
                        # Add the sentence to the current chunk
                        if sentence_chunk:
                            sentence_chunk += " " + sentence
                        else:
                            sentence_chunk = sentence
                        sentence_token_count += sentence_tokens
                
                # Add any remaining sentence content as a chunk
                if sentence_chunk:
                    chunk_id = str(uuid.uuid4())
                    chunk = Chunk(
                        content=sentence_chunk.strip(),
                        chunk_id=chunk_id,
                        document_id=document_id,
                        metadata=metadata
                    )
                    chunks.append(chunk)
            else:
                # Add the paragraph to the current chunk
                if current_chunk_text:
                    current_chunk_text += " " + paragraph
                else:
                    current_chunk_text = paragraph
                current_token_count += paragraph_tokens
                
                # Check if we've reached the target chunk size
                if current_token_count >= self.chunk_size:
                    chunk_id = str(uuid.uuid4())
                    chunk = Chunk(
                        content=current_chunk_text.strip(),
                        chunk_id=chunk_id,
                        document_id=document_id,
                        metadata=metadata
                    )
                    chunks.append(chunk)
                    
                    # Start a new chunk with overlap
                    if self.chunk_overlap > 0:
                        words = current_chunk_text.split()
                        current_chunk_text = " ".join(words[-min(self.chunk_overlap, len(words)):])
                        current_token_count = self._estimate_tokens(current_chunk_text)
                    else:
                        current_chunk_text = ""
                        current_token_count = 0
        
        # Add any remaining content as a final chunk; also emit it when no
        # chunk has been produced yet, so short inputs are not silently dropped
        if current_chunk_text and (current_token_count >= self.min_chunk_size or not chunks):
            chunk_id = str(uuid.uuid4())
            chunk = Chunk(
                content=current_chunk_text.strip(),
                chunk_id=chunk_id,
                document_id=document_id,
                metadata=metadata
            )
            chunks.append(chunk)
        
        return chunks
    
    def chunk(
        self, 
        content: str, 
        metadata: Optional[Dict[str, Any]] = None,
        document_id: Optional[str] = None
    ) -> List[Chunk]:
        """
        Split content into semantic chunks.
        
        Args:
            content: Content to be chunked
            metadata: Optional metadata to associate with chunks
            document_id: Optional document ID to associate with chunks
            
        Returns:
            chunks: List of Chunk objects
        """
        if not content.strip():
            return []
            
        # Identify paragraphs
        if self.respect_paragraphs:
            paragraphs = self._identify_paragraphs(content)
        else:
            # Treat the whole content as one paragraph
            paragraphs = [content]
        
        # Create chunks from paragraphs
        chunks = self._create_semantic_chunks(paragraphs, document_id, metadata)
        
        logger.info("Created %d chunks from content", len(chunks))
        return chunks
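

# Minimal usage sketch (illustrative only; assumes Chunk exposes the
# `chunk_id` and `content` attributes it is constructed with above).
if __name__ == "__main__":
    sample_text = (
        "Semantic chunking keeps related sentences together.\n\n"
        "A second paragraph is packed into the same chunk until the "
        "target chunk size is reached."
    )
    demo_chunker = SemanticChunker(chunk_size=64, chunk_overlap=8, min_chunk_size=1)
    for demo_chunk in demo_chunker.chunk(
        sample_text, document_id="doc-1", metadata={"source": "demo"}
    ):
        print(demo_chunk.chunk_id, len(demo_chunk.content.split()))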