Spaces:
Sleeping
Sleeping
from sklearn.metrics.pairwise import cosine_similarity | |
def clause_match_by_cosine(
    chunk_embeddings,
    clause_reference,
    *,
    top_k=3,
    threshold=0.75,
    preview_chars=200,
):
    """Tag each chunk with its most similar reference clauses.

    Every chunk embedding is compared against all reference clause
    embeddings using cosine similarity. The ``top_k`` highest-scoring
    clauses are kept, and of those only the ones whose similarity is at
    least ``threshold`` are reported.

    Args:
        chunk_embeddings: Iterable of dicts, each with a ``'chunk'`` entry
            (sliced for the preview; presumably the chunk text) and an
            ``'embedding'`` entry (the vector passed to
            ``cosine_similarity``).
        clause_reference: Iterable of dicts, each with an ``'embedding'``
            entry and a ``'label'`` entry. It is traversed twice, so pass a
            re-iterable container such as a list rather than a generator.
        top_k: Maximum number of candidate clauses considered per chunk.
            Defaults to 3.
        threshold: Minimum cosine similarity for a candidate to be
            reported. Defaults to 0.75.
        preview_chars: Number of leading characters of the chunk copied
            into ``'chunk_preview'``. Defaults to 200.

    Returns:
        A list with one dict per chunk, in input order, with the keys
        ``'chunk_index'``, ``'chunk_preview'`` and ``'top_matches'``.
        ``'top_matches'`` is a list of ``{'label', 'similarity'}`` dicts
        ordered from most to least similar; it is empty when nothing
        reaches ``threshold`` or when there are no reference clauses.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    reference_embeddings = [clause['embedding'] for clause in clause_reference]
    reference_labels = [clause['label'] for clause in clause_reference]

    tagged_chunks = []
    for i, chunk_data in enumerate(chunk_embeddings):
        chunk_text = chunk_data['chunk']
        top_matches = []

        # With no reference clauses (or top_k == 0) there is nothing to
        # rank; skipping the call also avoids the error cosine_similarity
        # raises when handed an empty reference matrix.
        if reference_embeddings and top_k:
            similarities = cosine_similarity(
                [chunk_data['embedding']], reference_embeddings
            )[0]
            # argsort is ascending: reverse it, then keep the best top_k.
            top_indices = similarities.argsort()[::-1][:top_k]
            top_matches = [
                {
                    "label": reference_labels[idx],
                    "similarity": float(similarities[idx]),
                }
                for idx in top_indices
                if similarities[idx] >= threshold
            ]

        tagged_chunks.append({
            "chunk_index": i,
            "chunk_preview": chunk_text[:preview_chars],
            "top_matches": top_matches,
        })
    return tagged_chunks