# oncall-guide-ai/tests/test_embedding_and_index.py
import numpy as np
from annoy import AnnoyIndex
import pytest
from data_processing import DataProcessor

@pytest.fixture(scope="module")
def processor():
    return DataProcessor(base_dir=".")
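
# The fixture above gives every test a shared DataProcessor. The tests only
# rely on two of its attributes: `models_dir` (a pathlib.Path under which the
# embeddings and Annoy indices are stored) and `embedding_dim` (the expected
# vector width). A minimal stand-in sketch of that assumed interface, with a
# hypothetical layout and dimension, not the real implementation:
#
#   from pathlib import Path
#
#   class DataProcessor:
#       def __init__(self, base_dir: str = "."):
#           self.models_dir = Path(base_dir) / "models"  # assumed layout
#           self.embedding_dim = 768                     # assumed width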

def test_embedding_dimensions(processor):
    # load emergency embeddings
    emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
    expected_dim = processor.embedding_dim

    assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
    assert emb.shape[1] == expected_dim, (
        f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
    )

def test_annoy_search(processor):
    # load embeddings
    emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")

    # load Annoy index
    idx = AnnoyIndex(processor.embedding_dim, 'angular')
    idx.load(str(processor.models_dir / "indices" / "annoy" / "emergency_index.ann"))

    # perform a sample query
    query_vec = emb[0]
    ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)

    assert len(ids) == 5
    # Annoy's angular distance is sqrt(2 * (1 - cosine similarity)), so it lies in [0, 2]
    assert all(0 <= d <= 2 for d in distances)
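
# Sketch of how the loaded index is assumed to have been built. The real build
# step lives in data_processing, not in this test file; the angular metric
# matches the one used above, while n_trees=10 is a hypothetical choice.
def _build_annoy_index_sketch(embeddings, n_trees=10):
    idx = AnnoyIndex(embeddings.shape[1], 'angular')  # same metric as the test
    for i, vec in enumerate(embeddings):
        idx.add_item(i, vec)  # one item per embedding row
    idx.build(n_trees)
    return idx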