import re

import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
    """

    Split text into chunks respecting sentence boundaries when possible,

    with optional overlap between chunks.



    Args:

        text: The text to split

        max_len: Maximum length for each chunk



    Returns:

        List of text chunks

    """
    # Normalize text
    text = text.replace("\r", "").replace("\n", " ").strip()

    # Extract sentences (split on whitespace that follows ., !, or ?)
    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(sentence) > max_len:
            # First flush the current chunk if it exists, then store the
            # oversized sentence as its own chunk (kept whole rather than
            # hard-split).
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.append(sentence)

        # Normal case - see if adding the sentence exceeds max_len
        elif len(current_chunk) + len(sentence) + 1 > max_len:
            # Close the current chunk and start a new one with this sentence
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            # Add to the current chunk
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
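
# A minimal usage sketch for hybrid_split (the sample string and max_len
# value are illustrative, not from the original file): short sentences are
# packed together until adding one more would exceed the length cap.
#
#   hybrid_split("First sentence. Second sentence! A third one?", max_len=30)
#   # -> ["First sentence.", "Second sentence! A third one?"]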


def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1 == 0 or norm_vec2 == 0:
        # Avoid division by zero; treat a zero vector as having no similarity
        return 0.0
    return dot_product / (norm_vec1 * norm_vec2)
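
# Worked example: for vec1 = [1.0, 0.0] and vec2 = [1.0, 1.0], the dot
# product is 1.0 and the norms are 1.0 and sqrt(2), so the similarity is
# 1 / sqrt(2) ~= 0.7071.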


def get_embedding(text):
    """Generate an embedding using SBERT."""
    return embedding_model.encode(text, convert_to_numpy=True)
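
# Note: SentenceTransformer.encode also accepts a list of strings and
# batches them internally, so semantic_chunking below could pass all of
# its sentences in a single call instead of encoding one at a time.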


def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
    """
    Splits text into semantic chunks based on adjacent-sentence similarity.

    - threshold: similarity cutoff below which a new chunk starts
      (higher = more splits, lower = fewer splits)
    - max_chunk_size: maximum size of each chunk in characters
    """
    text = text.replace("\n", " ").replace("\r", " ").strip()
    sentences = hybrid_split(text)
    if not sentences:
        return []
    embeddings = [get_embedding(sent) for sent in sentences]

    chunks = []
    current_chunk = [sentences[0]]

    for i in range(1, len(sentences)):
        sim = cosine_similarity(embeddings[i - 1], embeddings[i])
        if (
            sim < threshold
            or len(" ".join(current_chunk + [sentences[i]])) > max_chunk_size
        ):
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
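

# A minimal end-to-end sketch (the sample text and threshold value are
# illustrative, not from the original file). Guarded by __main__ so that
# importing the module does not trigger model inference.
if __name__ == "__main__":
    sample = (
        "Transformers process tokens in parallel. "
        "Attention lets each token attend to every other token. "
        "My cat enjoys sleeping in the sun."
    )
    for i, chunk in enumerate(semantic_chunking(sample, threshold=0.5)):
        print(f"Chunk {i}: {chunk}")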