Spaces:

sagar008
/

unified-analysis-for-legal-docs

Sleeping

App Files Files Community

sagar008 committed on Aug 10

Commit

137a4e7

verified ·

1 Parent(s): a5a31ff

Update clause_tagger.py

Browse files

Files changed (1) hide show

clause_tagger.py +51 -5

clause_tagger.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from typing import List, Dict, Any
 from sentence_transformers import SentenceTransformer
 import numpy as np
@@ -14,8 +15,16 @@ class ClauseTagger:
         """Initialize embedding model and load clause references"""
         if self.embedding_model is None:
             print("🧠 Loading embedding model for clause tagging...")
-            # Use a legal-domain model for better clause understanding
-            self.embedding_model = SentenceTransformer('law-ai/InLegalBERT')
             print("✅ Embedding model loaded")
         # Load clause references
@@ -53,7 +62,7 @@ class ClauseTagger:
         return clauses
     async def tag_clauses(self, chunks: List[str]) -> List[Dict[str, Any]]:
-        """Tag clauses in document chunks"""
         if not self.clause_reference:
             return []
@@ -75,7 +84,7 @@ class ClauseTagger:
                 )[0][0]
                 # Only include matches above threshold
-                if similarity > 0.7:  # Adjust threshold as needed
                     tagged_clauses.append({
                         'clause_id': clause['id'],
                         'clause_type': clause['type'],
@@ -88,4 +97,41 @@ class ClauseTagger:
         # Sort by similarity score and return top matches
         tagged_clauses.sort(key=lambda x: x['similarity_score'], reverse=True)
-        return tagged_clauses[:20]  # Return top 20 matches

+# clause_tagger.py
 from typing import List, Dict, Any
 from sentence_transformers import SentenceTransformer
 import numpy as np
         """Initialize embedding model and load clause references"""
         if self.embedding_model is None:
             print("🧠 Loading embedding model for clause tagging...")
+            # Set cache directory explicitly for HF Spaces
+            cache_folder = "/tmp/sentence_transformers_cache"
+            os.makedirs(cache_folder, exist_ok=True)
+            # Use a legal-domain model with explicit cache directory
+            self.embedding_model = SentenceTransformer(
+                'law-ai/InLegalBERT',
+                cache_folder=cache_folder
+            )
             print("✅ Embedding model loaded")
         # Load clause references
         return clauses
     async def tag_clauses(self, chunks: List[str]) -> List[Dict[str, Any]]:
+        """Tag clauses in document chunks - GENERATES NEW EMBEDDINGS"""
         if not self.clause_reference:
             return []
                 )[0][0]
                 # Only include matches above threshold
+                if similarity > 0.7:
                     tagged_clauses.append({
                         'clause_id': clause['id'],
                         'clause_type': clause['type'],
         # Sort by similarity score and return top matches
         tagged_clauses.sort(key=lambda x: x['similarity_score'], reverse=True)
+        return tagged_clauses[:20]
+    async def tag_clauses_with_embeddings(self, chunk_data: List[Dict]) -> List[Dict[str, Any]]:
+        """Tag clauses using pre-computed embeddings - OPTIMIZED VERSION"""
+        if not self.clause_reference:
+            return []
+        print(f"🏷️ Tagging clauses using pre-computed embeddings for {len(chunk_data)} chunks...")
+        tagged_clauses = []
+        for chunk_idx, chunk_info in enumerate(chunk_data):
+            chunk_embedding = chunk_info["embedding"]
+            if chunk_embedding is None:
+                continue
+            # Find best matching clauses using pre-computed embedding
+            for clause in self.clause_reference:
+                similarity = cosine_similarity(
+                    [chunk_embedding],
+                    [clause['embedding']]
+                )[0][0]
+                if similarity > 0.7:
+                    tagged_clauses.append({
+                        'clause_id': clause['id'],
+                        'clause_type': clause['type'],
+                        'clause_category': clause['category'],
+                        'matched_text': chunk_info["text"][:200] + '...' if len(chunk_info["text"]) > 200 else chunk_info["text"],
+                        'similarity_score': float(similarity),
+                        'chunk_index': chunk_idx,
+                        'reference_text': clause['text']
+                    })
+        tagged_clauses.sort(key=lambda x: x['similarity_score'], reverse=True)
+        return tagged_clauses[:6]