"""
Base classes for context chunking components.
"""
|
|
|
from abc import ABC, abstractmethod |
|
from typing import List, Dict, Any, Optional |
|
|
|
class Chunk:
    """Representation of a text chunk with metadata.

    Attributes:
        content: The text content of the chunk.
        chunk_id: Unique identifier for the chunk.
        document_id: ID of the source document, or ``None`` if not given.
        metadata: Metadata dictionary for the chunk; never ``None``.
        embedding: Set to ``None`` by the constructor; nothing in this
            class assigns it afterwards.
    """

    def __init__(
        self,
        content: str,
        chunk_id: str,
        document_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize a chunk.

        Args:
            content: The text content of the chunk
            chunk_id: Unique identifier for the chunk
            document_id: Optional ID of the source document
            metadata: Optional metadata for the chunk. When a dict is
                passed (even an empty one) it is stored by reference;
                when omitted or ``None`` a new empty dict is created.
        """
        self.content = content
        self.chunk_id = chunk_id
        self.document_id = document_id
        # Test against None explicitly: `metadata or {}` would silently
        # replace a caller-supplied empty dict with a different object,
        # so empty and non-empty dicts would be handled inconsistently.
        self.metadata = metadata if metadata is not None else {}
        self.embedding = None
|
|
|
class BaseChunker(ABC):
    """Abstract interface that content chunking components implement."""

    @abstractmethod
    def chunk(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        document_id: Optional[str] = None,
    ) -> List[Chunk]:
        """
        Break the given content up into chunks.

        Concrete subclasses must override this method; the base
        implementation has no behavior of its own.

        Args:
            content: Content to be chunked
            metadata: Optional metadata to associate with chunks
            document_id: Optional document ID to associate with chunks

        Returns:
            chunks: List of Chunk objects
        """
        ...
|
|