File size: 989 Bytes
18dd619
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from sklearn.metrics.pairwise import cosine_similarity

def clause_match_by_cosine(
    chunk_embeddings,
    clause_reference,
    *,
    top_k=3,
    threshold=0.75,
):
    """Tag each chunk with its most similar reference clauses.

    Cosine similarity is computed between every chunk embedding and every
    reference clause embedding. For each chunk, the ``top_k`` highest-scoring
    clauses are considered and those scoring at least ``threshold`` are kept,
    ordered from most to least similar.

    Args:
        chunk_embeddings: Iterable of mappings, each with a ``'chunk'`` entry
            (sliced to build the preview) and an ``'embedding'`` entry.
        clause_reference: Iterable of mappings, each with a ``'label'`` entry
            and an ``'embedding'`` entry.
        top_k: Maximum number of matches reported per chunk. Defaults to 3.
        threshold: Minimum similarity (inclusive) for a match to be reported.
            Defaults to 0.75.

    Returns:
        A list with one dict per chunk, in input order, holding
        ``"chunk_index"``, ``"chunk_preview"`` (first 200 items of the chunk)
        and ``"top_matches"`` (a list of ``{"label", "similarity"}`` dicts,
        possibly empty).

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Materialise once so one-shot iterables (generators) can be read twice.
    chunks = list(chunk_embeddings)
    reference_embeddings = [clause['embedding'] for clause in clause_reference]
    reference_labels = [clause['label'] for clause in clause_reference]

    tagged_chunks = [
        {
            "chunk_index": i,
            "chunk_preview": chunk_data['chunk'][:200],
            "top_matches": [],
        }
        for i, chunk_data in enumerate(chunks)
    ]

    # Nothing to compare: cosine_similarity rejects empty inputs, so return
    # the chunks with no matches instead of raising.
    if not chunks or not reference_labels or top_k == 0:
        return tagged_chunks

    # One batched call instead of one call per chunk, so the reference
    # embeddings are validated and normalised only once.
    similarity_matrix = cosine_similarity(
        [chunk_data['embedding'] for chunk_data in chunks],
        reference_embeddings,
    )

    for entry, similarities in zip(tagged_chunks, similarity_matrix):
        # Reverse first, then slice: `[-top_k:]` would misbehave for top_k=0.
        top_indices = similarities.argsort()[::-1][:top_k]
        entry["top_matches"] = [
            {
                "label": reference_labels[idx],
                "similarity": float(similarities[idx]),
            }
            for idx in top_indices
            if similarities[idx] >= threshold
        ]

    return tagged_chunks