sagar008 committed on
Commit
a5a31ff
·
verified ·
1 Parent(s): 96b5525

Update vector_store.py

Browse files
Files changed (1) hide show
  1. vector_store.py +32 -34
vector_store.py CHANGED
@@ -3,12 +3,11 @@
3
  Vector store integration for legal document embeddings using InLegalBERT and Pinecone
4
  """
5
  import os
6
- import pinecone
7
- from langchain_pinecone import PineconeVectorStore # ⭐ Modern import
8
- from langchain.embeddings.base import Embeddings
9
- from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  import numpy as np
11
  from typing import List, Dict, Any
 
 
 
12
 
13
  class InLegalBERTEmbeddings(Embeddings):
14
  """Custom LangChain embeddings wrapper for InLegalBERT"""
@@ -61,31 +60,30 @@ class LegalDocumentVectorStore:
61
 
62
  self._initialized = True
63
 
64
- def save_document_embeddings(self, document_text: str, document_id: str,
65
- analysis_results: Dict[str, Any], clause_tagger) -> bool:
66
- """Save document embeddings using InLegalBERT model"""
67
  try:
68
  self._initialize_pinecone()
69
 
70
- # Use the clause tagger's InLegalBERT model
71
- legal_embeddings = InLegalBERTEmbeddings(clause_tagger.embedding_model)
72
-
73
- # Split document into chunks
74
- text_splitter = RecursiveCharacterTextSplitter(
75
- chunk_size=1000,
76
- chunk_overlap=200,
77
- separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
78
- )
79
 
80
- chunks = text_splitter.split_text(document_text)
 
 
81
 
82
- # Prepare metadata with analysis results
83
  metadatas = []
84
- for i, chunk in enumerate(chunks):
 
 
 
85
  metadata = {
86
  'document_id': document_id,
87
  'chunk_index': i,
88
- 'total_chunks': len(chunks),
89
  'source': 'legal_document',
90
  'has_key_clauses': len(analysis_results.get('key_clauses', [])) > 0,
91
  'risk_count': len(analysis_results.get('risky_terms', [])),
@@ -94,26 +92,25 @@ class LegalDocumentVectorStore:
94
  }
95
  metadatas.append(metadata)
96
 
97
- # Create vector store using modern API
98
  index = self.pc.Index(self.index_name)
99
- vectorstore = PineconeVectorStore(
100
- index=index,
101
- embedding=legal_embeddings,
102
- text_key="text"
103
- )
104
 
105
- # Add documents to Pinecone
106
- vectorstore.add_texts(
107
- texts=chunks,
108
- metadatas=metadatas,
109
- ids=[f"{document_id}_chunk_{i}" for i in range(len(chunks))]
110
- )
 
 
111
 
112
- print(f"✅ Saved {len(chunks)} chunks using InLegalBERT embeddings for document {document_id}")
 
 
113
  return True
114
 
115
  except Exception as e:
116
- print(f"❌ Error saving to Pinecone: {e}")
117
  return False
118
 
119
  def get_retriever(self, clause_tagger, document_id: str = None):
@@ -143,3 +140,4 @@ class LegalDocumentVectorStore:
143
 
144
  # Global instance
145
  vector_store = LegalDocumentVectorStore()
 
 
3
  Vector store integration for legal document embeddings using InLegalBERT and Pinecone
4
  """
5
  import os
 
 
 
 
6
  import numpy as np
7
  from typing import List, Dict, Any
8
+ from langchain_pinecone import PineconeVectorStore
9
+ from langchain.embeddings.base import Embeddings
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
 
12
  class InLegalBERTEmbeddings(Embeddings):
13
  """Custom LangChain embeddings wrapper for InLegalBERT"""
 
60
 
61
  self._initialized = True
62
 
63
+ def save_document_embeddings_optimized(self, chunk_data: List[Dict], document_id: str,
64
+ analysis_results: Dict[str, Any]) -> bool:
65
+ """Save embeddings using pre-computed vectors - NO RE-EMBEDDING"""
66
  try:
67
  self._initialize_pinecone()
68
 
69
+ # Use pre-computed embeddings instead of re-generating
70
+ texts = [chunk["text"] for chunk in chunk_data]
71
+ embeddings = [chunk["embedding"].tolist() for chunk in chunk_data if chunk["embedding"] is not None]
 
 
 
 
 
 
72
 
73
+ if not embeddings:
74
+ print("⚠️ No embeddings found in chunk_data")
75
+ return False
76
 
77
+ # Prepare metadata
78
  metadatas = []
79
+ for i, chunk_info in enumerate(chunk_data):
80
+ if chunk_info["embedding"] is None:
81
+ continue
82
+
83
  metadata = {
84
  'document_id': document_id,
85
  'chunk_index': i,
86
+ 'total_chunks': len(chunk_data),
87
  'source': 'legal_document',
88
  'has_key_clauses': len(analysis_results.get('key_clauses', [])) > 0,
89
  'risk_count': len(analysis_results.get('risky_terms', [])),
 
92
  }
93
  metadatas.append(metadata)
94
 
95
+ # Add to Pinecone using pre-computed embeddings
96
  index = self.pc.Index(self.index_name)
 
 
 
 
 
97
 
98
+ vectors = [
99
+ {
100
+ "id": f"{document_id}_chunk_{i}",
101
+ "values": embedding,
102
+ "metadata": metadata
103
+ }
104
+ for i, (embedding, metadata) in enumerate(zip(embeddings, metadatas))
105
+ ]
106
 
107
+ index.upsert(vectors=vectors)
108
+
109
+ print(f"✅ Saved {len(vectors)} pre-computed embeddings to Pinecone")
110
  return True
111
 
112
  except Exception as e:
113
+ print(f"❌ Error saving pre-computed embeddings: {e}")
114
  return False
115
 
116
  def get_retriever(self, clause_tagger, document_id: str = None):
 
140
 
141
  # Global instance
142
  vector_store = LegalDocumentVectorStore()
143
+