Spaces:

sagar008
/

unified-analysis-for-legal-docs

Sleeping

File size: 5,353 Bytes

08df214
 
 
 
e96a966
 
 
a5a31ff
 
 
e96a966
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08df214
e96a966
 
08df214
e96a966
 
 
 
 
 
 
 
08df214
 
ac89d45
e96a966
 
ac89d45
 
 
e96a966
 
ac89d45
 
e96a966
 
 
 
 
a5a31ff
 
 
e96a966
 
 
a5a31ff
 
 
e96a966
a5a31ff
 
 
e96a966
a5a31ff
e96a966
a5a31ff
 
 
 
e96a966
 
 
a5a31ff
e96a966
 
 
 
 
 
 
 
a5a31ff
ac89d45
e96a966
a5a31ff
 
 
 
 
 
 
 
e96a966
a5a31ff
 
 
e96a966
 
 
a5a31ff
e96a966
 
 
 
 
 
 
 
ac89d45
e96a966
08df214
e96a966
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5a31ff

# vector_store.py
"""
Vector store integration for legal document embeddings using InLegalBERT and Pinecone
"""
import os
import numpy as np
from typing import List, Dict, Any
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

class InLegalBERTEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around an InLegalBERT encoder.

    The wrapped ``model`` must expose ``encode(list_of_str)`` and return an
    indexable, array-like object that supports ``.tolist()``.
    """

    def __init__(self, model):
        # Keep a reference only; the encoder is supplied by the caller.
        self.model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per entry of ``texts``."""
        encoded_batch = self.model.encode(texts)
        return encoded_batch.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for a single query string."""
        # The encoder is called with a batch, so wrap the query in a
        # one-item list and unwrap the first (only) row afterwards.
        encoded_batch = self.model.encode([text])
        query_vector = encoded_batch[0]
        return query_vector.tolist()

class LegalDocumentVectorStore:
    """Manages vector storage for legal documents in a Pinecone index.

    The Pinecone client is created lazily by ``_initialize_pinecone`` the
    first time a public method needs it, so constructing an instance performs
    no network I/O and does not require ``PINECONE_API_KEY`` to be set.
    """

    # Number of vectors sent per upsert request; keeps each request small
    # enough for large documents instead of sending everything at once.
    UPSERT_BATCH_SIZE = 100

    def __init__(self):
        self.index_name = 'legal-documents'
        self.dimension = 768  # InLegalBERT dimension
        self._initialized = False
        self.clause_tagger = None
        self.pc = None

    def _initialize_pinecone(self):
        """Create the Pinecone client and make sure the index exists.

        Does nothing when the store has already been initialized.

        Raises:
            ValueError: if the ``PINECONE_API_KEY`` environment variable is
                missing or empty.
        """
        if self._initialized:
            return

        api_key = os.getenv('PINECONE_API_KEY')
        if not api_key:
            raise ValueError("PINECONE_API_KEY environment variable not set")

        # Imported here so the module can be imported without the Pinecone
        # client being exercised until it is actually needed.
        from pinecone import Pinecone, ServerlessSpec
        self.pc = Pinecone(api_key=api_key)

        # Create index if doesn't exist
        existing_indexes = [index_info["name"] for index_info in self.pc.list_indexes()]
        if self.index_name not in existing_indexes:
            self.pc.create_index(
                name=self.index_name,
                dimension=self.dimension,
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region='us-east-1')
            )
            print(f"✅ Created Pinecone index: {self.index_name}")

        self._initialized = True

    def save_document_embeddings_optimized(self, chunk_data: List[Dict], document_id: str,
                                         analysis_results: Dict[str, Any]) -> bool:
        """Save embeddings using pre-computed vectors - NO RE-EMBEDDING.

        Args:
            chunk_data: One dict per chunk with a ``"text"`` entry and an
                ``"embedding"`` entry (array-like, or ``None`` when the chunk
                has no embedding). Chunks without an embedding are skipped.
            document_id: Identifier stored in each vector's metadata and used
                as the prefix of each vector id.
            analysis_results: Analysis output; only the lengths of its
                ``'key_clauses'`` and ``'risky_terms'`` entries are used.

        Returns:
            ``True`` when at least one vector was upserted, ``False`` when no
            chunk had an embedding or when any error occurred (the error is
            printed, not raised).
        """
        try:
            self._initialize_pinecone()

            # Document-level metadata is identical for every chunk, so it is
            # computed once. ``or`` guards tolerate None values.
            analysis = analysis_results or {}
            has_key_clauses = len(analysis.get('key_clauses') or []) > 0
            risk_count = len(analysis.get('risky_terms') or [])
            timestamp = str(np.datetime64('now'))
            total_chunks = len(chunk_data)

            vectors = []
            for chunk_index, chunk_info in enumerate(chunk_data):
                embedding = chunk_info.get("embedding")
                if embedding is None:
                    continue

                vectors.append({
                    # The id uses the chunk's original position so it always
                    # agrees with metadata['chunk_index'], even when earlier
                    # chunks were skipped for lacking an embedding.
                    "id": f"{document_id}_chunk_{chunk_index}",
                    # np.asarray accepts ndarrays as well as plain sequences.
                    "values": np.asarray(embedding).tolist(),
                    "metadata": {
                        # get_retriever() reads the chunk text back from the
                        # 'text' metadata key (text_key="text"), so it must
                        # be stored alongside the vector.
                        'text': chunk_info["text"],
                        'document_id': document_id,
                        'chunk_index': chunk_index,
                        'total_chunks': total_chunks,
                        'source': 'legal_document',
                        'has_key_clauses': has_key_clauses,
                        'risk_count': risk_count,
                        'embedding_model': 'InLegalBERT',
                        'timestamp': timestamp
                    }
                })

            if not vectors:
                print("⚠️ No embeddings found in chunk_data")
                return False

            # Add to Pinecone using pre-computed embeddings, in batches so a
            # large document does not produce one oversized request.
            index = self.pc.Index(self.index_name)
            batch_size = self.UPSERT_BATCH_SIZE
            for start in range(0, len(vectors), batch_size):
                index.upsert(vectors=vectors[start:start + batch_size])

            print(f"✅ Saved {len(vectors)} pre-computed embeddings to Pinecone")
            return True

        except Exception as e:
            # Best-effort API: report the failure and signal it via the
            # return value instead of propagating.
            print(f"❌ Error saving pre-computed embeddings: {e}")
            return False

    def get_retriever(self, clause_tagger, document_id: str = None):
        """Get retriever for chat functionality.

        Args:
            clause_tagger: Object whose ``embedding_model`` attribute is
                wrapped in ``InLegalBERTEmbeddings`` to embed queries.
            document_id: When given (and non-empty), results are restricted
                to vectors whose ``document_id`` metadata matches.

        Returns:
            A retriever returning the top 5 matches, or ``None`` if anything
            fails while building it (the error is printed, not raised).
        """
        try:
            self._initialize_pinecone()

            legal_embeddings = InLegalBERTEmbeddings(clause_tagger.embedding_model)
            index = self.pc.Index(self.index_name)

            vectorstore = PineconeVectorStore(
                index=index,
                embedding=legal_embeddings,
                text_key="text"
            )

            # Create retriever with optional document filtering
            search_kwargs = {'k': 5}
            if document_id:
                search_kwargs['filter'] = {'document_id': document_id}

            return vectorstore.as_retriever(search_kwargs=search_kwargs)

        except Exception as e:
            print(f"❌ Error creating retriever: {e}")
            return None

# Global instance
# Shared module-level store. Constructing it only sets default attributes;
# the Pinecone connection is opened lazily by _initialize_pinecone() the
# first time a method needs it.
vector_store = LegalDocumentVectorStore()