import re

import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
def hybrid_split(text: str, max_len: int = 1024) -> list[str]:
    """
    Split text into chunks, respecting sentence boundaries when possible.
    Sentences longer than max_len are hard-split on character boundaries.

    Args:
        text: The text to split
        max_len: Maximum length for each chunk

    Returns:
        List of text chunks
    """
    # Normalize whitespace: drop carriage returns, flatten newlines
    text = text.replace("\r", "").replace("\n", " ").strip()
    # Extract sentences: split after sentence-ending punctuation
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(sentence) > max_len:
            # Flush the current chunk first, then hard-split the oversized
            # sentence so no chunk exceeds max_len
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.extend(
                sentence[i : i + max_len] for i in range(0, len(sentence), max_len)
            )
        elif len(current_chunk) + len(sentence) + 1 > max_len:
            # Adding this sentence would exceed max_len: close the current
            # chunk and start a new one with this sentence
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            # Append the sentence to the current chunk
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
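
# A minimal behavioral sketch of hybrid_split (the sample strings are
# hypothetical; with max_len=20 the two sentences cannot share a chunk,
# so each becomes its own chunk):
#
#   >>> hybrid_split("First sentence. Second sentence.", max_len=20)
#   ['First sentence.', 'Second sentence.']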
def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)
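
# Sanity check with hypothetical vectors: parallel vectors score 1.0,
# orthogonal vectors score 0.0:
#
#   >>> cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0]))
#   1.0
#   >>> cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 3.0]))
#   0.0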
def get_embedding(text):
    """Generate an embedding using SBERT."""
    return embedding_model.encode(text, convert_to_numpy=True)
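
# Note: all-MiniLM-L6-v2 maps text to a 384-dimensional vector, so
# get_embedding("hello world").shape == (384,).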
def semantic_chunking(text, threshold=0.75, max_chunk_size=8191):
    """
    Splits text into semantic chunks based on consecutive-piece similarity.

    - threshold: similarity cutoff for splitting; a lower value means
      fewer splits, a higher value means more splits
    - max_chunk_size: maximum size of each chunk in characters
    """
    text = text.replace("\n", " ").replace("\r", " ").strip()
    # hybrid_split returns sentence-bounded pieces capped at 1024 characters
    sentences = hybrid_split(text)
    if not sentences:
        return []
    embeddings = [get_embedding(sent) for sent in sentences]
    chunks = []
    current_chunk = [sentences[0]]
    for i in range(1, len(sentences)):
        # Compare each piece with its predecessor; a drop in similarity
        # (or an oversized chunk) starts a new chunk
        sim = cosine_similarity(embeddings[i - 1], embeddings[i])
        if (
            sim < threshold
            or len(" ".join(current_chunk + [sentences[i]])) > max_chunk_size
        ):
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
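
# A minimal end-to-end sketch. The sample paragraph is made up, and the
# exact boundaries depend on the embedding model and threshold, so the
# split below is illustrative rather than guaranteed:
if __name__ == "__main__":
    sample = (
        "Transformers use self-attention to relate tokens to one another. "
        "Attention scores come from query and key projections. "
        "Meanwhile, the stock market closed higher on Friday. "
        "Tech shares led the late-session gains."
    )
    for i, chunk in enumerate(semantic_chunking(sample, threshold=0.5)):
        print(f"chunk {i}: {chunk}")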