sagar008 committed on
Commit
18dd619
·
verified ·
1 Parent(s): 47c18ba

Create tagger.py

Browse files
Files changed (1) hide show
  1. tagger.py +29 -0
tagger.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.metrics.pairwise import cosine_similarity
2
+
3
def clause_match_by_cosine(chunk_embeddings, clause_reference, *, top_k=3, threshold=0.75):
    """Tag each text chunk with the reference clauses most similar to it.

    For every chunk, the cosine similarity between the chunk embedding and
    every reference clause embedding is computed; the ``top_k`` highest
    scoring clauses whose similarity is at least ``threshold`` are reported.

    Args:
        chunk_embeddings: Iterable of mappings, each with a ``'chunk'`` entry
            (the chunk text; it is sliced, so it must support ``[:200]``) and
            an ``'embedding'`` entry (the chunk's vector).
        clause_reference: Iterable of mappings, each with an ``'embedding'``
            entry (the clause vector) and a ``'label'`` entry.
        top_k: Maximum number of matches reported per chunk. Defaults to 3.
        threshold: Minimum cosine similarity a clause needs to be reported.
            Defaults to 0.75.

    Returns:
        A list with one dict per chunk, in input order, holding:
        ``"chunk_index"`` (position of the chunk in the input),
        ``"chunk_preview"`` (first 200 characters of the chunk text) and
        ``"top_matches"`` (list of ``{"label", "similarity"}`` dicts sorted
        by descending similarity; empty when nothing reaches the threshold
        or when there are no reference clauses).

    Raises:
        ValueError: If ``top_k`` is negative.
        KeyError: If a chunk or clause mapping lacks one of the keys above.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # Materialise once so a one-shot iterable is not exhausted by the first
    # comprehension before the second one runs.
    clauses = list(clause_reference)
    reference_embeddings = [clause['embedding'] for clause in clauses]
    reference_labels = [clause['label'] for clause in clauses]
    tagged_chunks = []

    for i, chunk_data in enumerate(chunk_embeddings):
        chunk_text = chunk_data['chunk']
        chunk_embed = chunk_data['embedding']

        top_matches = []
        # With no reference clauses there is nothing to compare against, and
        # cosine_similarity rejects an empty reference set, so skip the call.
        if reference_embeddings and top_k:
            similarities = cosine_similarity([chunk_embed], reference_embeddings)[0]
            # Reverse first, then truncate: same elements and order as
            # ``argsort()[-k:][::-1]`` but without the ``[-0:]`` pitfall.
            top_indices = similarities.argsort()[::-1][:top_k]
            top_matches = [
                {
                    "label": reference_labels[idx],
                    # Cast to a plain float so the result is JSON-friendly.
                    "similarity": float(similarities[idx]),
                }
                for idx in top_indices
                if similarities[idx] >= threshold
            ]

        tagged_chunks.append({
            "chunk_index": i,
            "chunk_preview": chunk_text[:200],
            "top_matches": top_matches,
        })

    return tagged_chunks