Update vector_store.py
vector_store.py (+32 -34)

@@ -3,12 +3,11 @@
 Vector store integration for legal document embeddings using InLegalBERT and Pinecone
 """
 import os
-import pinecone
-from langchain_pinecone import PineconeVectorStore  # ⭐ Modern import
-from langchain.embeddings.base import Embeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 import numpy as np
 from typing import List, Dict, Any
+from langchain_pinecone import PineconeVectorStore
+from langchain.embeddings.base import Embeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 class InLegalBERTEmbeddings(Embeddings):
     """Custom LangChain embeddings wrapper for InLegalBERT"""
@@ -61,31 +60,30 @@ class LegalDocumentVectorStore:
 
         self._initialized = True
 
-    def …
-            …
-        """Save …
+    def save_document_embeddings_optimized(self, chunk_data: List[Dict], document_id: str,
+                                           analysis_results: Dict[str, Any]) -> bool:
+        """Save embeddings using pre-computed vectors - NO RE-EMBEDDING"""
         try:
             self._initialize_pinecone()
 
-            # Use …
-            …
-            …
-            # Split document into chunks
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000,
-                chunk_overlap=200,
-                separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
-            )
+            # Use pre-computed embeddings instead of re-generating
+            texts = [chunk["text"] for chunk in chunk_data]
+            embeddings = [chunk["embedding"].tolist() for chunk in chunk_data if chunk["embedding"] is not None]
 
-            …
+            if not embeddings:
+                print("⚠️ No embeddings found in chunk_data")
+                return False
 
-            # Prepare metadata
+            # Prepare metadata
             metadatas = []
-            for i, …
+            for i, chunk_info in enumerate(chunk_data):
+                if chunk_info["embedding"] is None:
+                    continue
+
                 metadata = {
                     'document_id': document_id,
                     'chunk_index': i,
-                    'total_chunks': len(…
+                    'total_chunks': len(chunk_data),
                     'source': 'legal_document',
                     'has_key_clauses': len(analysis_results.get('key_clauses', [])) > 0,
                     'risk_count': len(analysis_results.get('risky_terms', [])),
@@ -94,26 +92,25 @@ class LegalDocumentVectorStore:
                 }
                 metadatas.append(metadata)
 
-            # …
+            # Add to Pinecone using pre-computed embeddings
            index = self.pc.Index(self.index_name)
-            vectorstore = PineconeVectorStore(
-                index=index,
-                embedding=legal_embeddings,
-                text_key="text"
-            )
 
-            …
-            …
-            …
-            …
-            …
-            …
+            vectors = [
+                {
+                    "id": f"{document_id}_chunk_{i}",
+                    "values": embedding,
+                    "metadata": metadata
+                }
+                for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas))
+            ]
 
-            …
+            index.upsert(vectors=vectors)
+
+            print(f"✅ Saved {len(vectors)} pre-computed embeddings to Pinecone")
             return True
 
         except Exception as e:
-            print(f"❌ Error saving …
+            print(f"❌ Error saving pre-computed embeddings: {e}")
             return False
 
     def get_retriever(self, clause_tagger, document_id: str = None):
@@ -143,3 +140,4 @@ class LegalDocumentVectorStore:
 
 # Global instance
 vector_store = LegalDocumentVectorStore()
+
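
A note on the new entry point: `save_document_embeddings_optimized` expects each element of `chunk_data` to carry the chunk text under `"text"` and a pre-computed vector under `"embedding"` (the diff reads exactly those two keys and calls `.tolist()` on the vector), plus an `analysis_results` dict with `key_clauses` and `risky_terms`. A minimal caller sketch; the 768-dimensional random vectors, document id, and analysis values below are purely illustrative, not taken from the repository:

```python
import numpy as np

from vector_store import vector_store  # global instance created at the bottom of the file

# Hypothetical pre-computed chunks: chunk text plus one InLegalBERT vector each
# (768 dimensions assumed for a BERT-base checkpoint such as InLegalBERT).
chunk_data = [
    {"text": "The lessee shall indemnify the lessor against all claims ...",
     "embedding": np.random.rand(768)},
    {"text": "Either party may terminate this agreement with 30 days notice ...",
     "embedding": np.random.rand(768)},
]

# Keys mirror those read in the diff; the values are made up for the example.
analysis_results = {
    "key_clauses": ["indemnification clause", "termination clause"],
    "risky_terms": ["unlimited liability"],
}

ok = vector_store.save_document_embeddings_optimized(
    chunk_data=chunk_data,
    document_id="contract-123",
    analysis_results=analysis_results,
)
print("stored" if ok else "failed")
```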
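
The hunk context also shows `class InLegalBERTEmbeddings(Embeddings)`, whose body is unchanged and not visible in this diff. For orientation only, a rough sketch of what a custom LangChain `Embeddings` wrapper around the `law-ai/InLegalBERT` checkpoint typically looks like; the mean pooling, 512-token truncation, and class name are assumptions, not the repository's actual implementation:

```python
from typing import List

import torch
from transformers import AutoModel, AutoTokenizer
from langchain.embeddings.base import Embeddings


class InLegalBERTEmbeddingsSketch(Embeddings):
    """Illustrative wrapper: embeds text with InLegalBERT via mean pooling."""

    def __init__(self, model_name: str = "law-ai/InLegalBERT"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def _embed(self, text: str) -> List[float]:
        inputs = self.tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean-pool the token embeddings into a single 768-dim sentence vector
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self._embed(t) for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return self._embed(text)
```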
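
The optimized path writes `{"id", "values", "metadata"}` records straight to the index with `index.upsert(vectors=...)`, so nothing gets re-embedded. For very large documents that call is commonly split into batches; a small hypothetical helper (the batch size is arbitrary, and `index` stands for the handle returned by `self.pc.Index(self.index_name)`):

```python
def upsert_in_batches(index, vectors, batch_size: int = 100) -> int:
    """Upsert pre-built Pinecone records in fixed-size batches; returns how many were sent."""
    total = 0
    for start in range(0, len(vectors), batch_size):
        batch = vectors[start:start + batch_size]
        index.upsert(vectors=batch)  # same call the diff makes, just chunked
        total += len(batch)
    return total
```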
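
Finally, the unchanged `get_retriever(self, clause_tagger, document_id=None)` hook suggests per-document retrieval over this index. One hedged way to scope a LangChain retriever to the `document_id` metadata written above; the `text_key="text"` argument mirrors the removed code and assumes the chunk text is also stored in the vector metadata, and the helper name and `k` value are illustrative:

```python
from langchain_pinecone import PineconeVectorStore

def make_document_retriever(index, embeddings, document_id: str, k: int = 4):
    """Hypothetical retriever limited to one document via a metadata filter."""
    store = PineconeVectorStore(index=index, embedding=embeddings, text_key="text")
    return store.as_retriever(
        search_kwargs={"k": k, "filter": {"document_id": document_id}},
    )
```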