Spaces:

TeamSAS
/

UB_VSA

Running

App Files Community

AUMREDKA committed 26 days ago

Commit

999388b

verified ·

1 Parent(s): 3673d92

Update buffalo_rag/vector_store/db.py

Browse files

Files changed (1) hide show

buffalo_rag/vector_store/db.py +4 -53

buffalo_rag/vector_store/db.py CHANGED Viewed

@@ -17,21 +17,15 @@ class VectorStore:
         self.chunk_ids = []
         self.chunks = {}
-        # Load embedding model
         self.model = SentenceTransformer(model_name)
-        # Load reranker model
         self.reranker = CrossEncoder(reranker_name)
-        # Load or create index
         self.load_or_create_index()
     def load_or_create_index(self) -> None:
-        """Load existing index or create a new one."""
         index_path = os.path.join(self.embedding_dir, 'faiss_index.pkl')
         if os.path.exists(index_path):
-            # Load existing index
             with open(index_path, 'rb') as f:
                 data = pickle.load(f)
                 self.index = data['index']
@@ -39,7 +33,6 @@ class VectorStore:
                 self.chunks = data['chunks']
             print(f"Loaded existing index with {len(self.chunk_ids)} chunks")
         else:
-            # Create new index
             embeddings_path = os.path.join(self.embedding_dir, 'embeddings.pkl')
             if os.path.exists(embeddings_path):
                 self.create_index()
@@ -53,22 +46,18 @@ class VectorStore:
         with open(embeddings_path, 'rb') as f:
             embedding_map = pickle.load(f)
-        # Extract embeddings and chunk IDs
         chunk_ids = list(embedding_map.keys())
         embeddings = np.array([embedding_map[chunk_id]['embedding'] for chunk_id in chunk_ids])
         chunks = {chunk_id: embedding_map[chunk_id]['chunk'] for chunk_id in chunk_ids}
-        # Create FAISS index
         dimension = embeddings.shape[1]
         index = faiss.IndexFlatL2(dimension)
         index.add(embeddings.astype(np.float32))
-        # Save index and metadata
         self.index = index
         self.chunk_ids = chunk_ids
         self.chunks = chunks
-        # Save to disk
         with open(os.path.join(self.embedding_dir, 'faiss_index.pkl'), 'wb') as f:
             pickle.dump({
                 'index': index,
@@ -83,24 +72,20 @@ class VectorStore:
               k: int = 5,
               filter_categories: Optional[List[str]] = None,
               rerank: bool = True) -> List[Dict[str, Any]]:
-        """Search for relevant chunks."""
         if self.index is None:
             print("No index available. Please create an index first.")
             return []
-        # Create query embedding
         query_embedding = self.model.encode([query])[0]
-        # Search index
         D, I = self.index.search(np.array([query_embedding]).astype(np.float32), min(k * 2, len(self.chunk_ids)))
-        # Get results
         results = []
         for i, idx in enumerate(I[0]):
             chunk_id = self.chunk_ids[idx]
             chunk = self.chunks[chunk_id]
-            # Apply category filter if specified
             if filter_categories and not any(cat in chunk.get('categories', []) for cat in filter_categories):
                 continue
@@ -111,22 +96,16 @@ class VectorStore:
             }
             results.append(result)
-        # Rerank results if requested
         if rerank and results:
-            # Prepare pairs for reranking
             pairs = [(query, result['chunk']['content']) for result in results]
-            # Get reranking scores
             rerank_scores = self.reranker.predict(pairs)
-            # Update scores and sort
             for i, score in enumerate(rerank_scores):
                 results[i]['rerank_score'] = float(score)
-            # Sort by rerank score
             results = sorted(results, key=lambda x: x['rerank_score'], reverse=True)
-            # Limit to k results
             results = results[:k]
         return results
@@ -135,29 +114,22 @@ class VectorStore:
                      query: str,
                      k: int = 5,
                      filter_categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
-        """Combine dense vector search with BM25-style keyword matching."""
-        # Get vector search results
         vector_results = self.search(query, k=k, filter_categories=filter_categories, rerank=False)
-        # Simple keyword matching (simulating BM25)
         keywords = query.lower().split()
-        # Score all chunks by keyword presence
         keyword_scores = {}
         for chunk_id, chunk_data in self.chunks.items():
             chunk = chunk_data
             content = (chunk['title'] + " " + chunk['content']).lower()
-            # Count keyword matches
             score = sum(content.count(keyword) for keyword in keywords)
-            # Apply category filter if specified
             if filter_categories and not any(cat in chunk.get('categories', []) for cat in filter_categories):
                 continue
             keyword_scores[chunk_id] = score
-        # Get top keyword matches
         keyword_results = sorted(
             [{'chunk_id': chunk_id, 'score': score, 'chunk': self.chunks[chunk_id]}
              for chunk_id, score in keyword_scores.items() if score > 0],
@@ -165,49 +137,28 @@ class VectorStore:
             reverse=True
         )[:k]
-        # Combine results (remove duplicates)
         seen_ids = set()
         combined_results = []
-        # Add vector results first
         for result in vector_results:
             combined_results.append(result)
             seen_ids.add(result['chunk_id'])
-        # Add keyword results if not already added
         for result in keyword_results:
             if result['chunk_id'] not in seen_ids:
                 combined_results.append(result)
                 seen_ids.add(result['chunk_id'])
-        # Limit to k results
         combined_results = combined_results[:k]
-        # Rerank final results
         if combined_results:
-            # Prepare pairs for reranking
             pairs = [(query, result['chunk']['content']) for result in combined_results]
-            # Get reranking scores
             rerank_scores = self.reranker.predict(pairs)
-            # Update scores and sort
             for i, score in enumerate(rerank_scores):
                 combined_results[i]['rerank_score'] = float(score)
-            # Sort by rerank score
             combined_results = sorted(combined_results, key=lambda x: x['rerank_score'], reverse=True)
-        return combined_results
-# Example usage
-if __name__ == "__main__":
-    vector_store = VectorStore()
-    results = vector_store.hybrid_search("How do I apply for OPT?")
-    print(f"Found {len(results)} results")
-    for i, result in enumerate(results[:3]):
-        print(f"Result {i+1}: {result['chunk']['title']}")
-        print(f"Score: {result.get('rerank_score', result['score'])}")
-        print(f"Content: {result['chunk']['content'][:100]}...")
-        print()

         self.chunk_ids = []
         self.chunks = {}
         self.model = SentenceTransformer(model_name)
         self.reranker = CrossEncoder(reranker_name)
         self.load_or_create_index()
     def load_or_create_index(self) -> None:
         index_path = os.path.join(self.embedding_dir, 'faiss_index.pkl')
         if os.path.exists(index_path):
             with open(index_path, 'rb') as f:
                 data = pickle.load(f)
                 self.index = data['index']
                 self.chunks = data['chunks']
             print(f"Loaded existing index with {len(self.chunk_ids)} chunks")
         else:
             embeddings_path = os.path.join(self.embedding_dir, 'embeddings.pkl')
             if os.path.exists(embeddings_path):
                 self.create_index()
         with open(embeddings_path, 'rb') as f:
             embedding_map = pickle.load(f)
         chunk_ids = list(embedding_map.keys())
         embeddings = np.array([embedding_map[chunk_id]['embedding'] for chunk_id in chunk_ids])
         chunks = {chunk_id: embedding_map[chunk_id]['chunk'] for chunk_id in chunk_ids}
         dimension = embeddings.shape[1]
         index = faiss.IndexFlatL2(dimension)
         index.add(embeddings.astype(np.float32))
         self.index = index
         self.chunk_ids = chunk_ids
         self.chunks = chunks
         with open(os.path.join(self.embedding_dir, 'faiss_index.pkl'), 'wb') as f:
             pickle.dump({
                 'index': index,
               k: int = 5,
               filter_categories: Optional[List[str]] = None,
               rerank: bool = True) -> List[Dict[str, Any]]:
         if self.index is None:
             print("No index available. Please create an index first.")
             return []
         query_embedding = self.model.encode([query])[0]
         D, I = self.index.search(np.array([query_embedding]).astype(np.float32), min(k * 2, len(self.chunk_ids)))
         results = []
         for i, idx in enumerate(I[0]):
             chunk_id = self.chunk_ids[idx]
             chunk = self.chunks[chunk_id]
             if filter_categories and not any(cat in chunk.get('categories', []) for cat in filter_categories):
                 continue
             }
             results.append(result)
         if rerank and results:
             pairs = [(query, result['chunk']['content']) for result in results]
             rerank_scores = self.reranker.predict(pairs)
             for i, score in enumerate(rerank_scores):
                 results[i]['rerank_score'] = float(score)
             results = sorted(results, key=lambda x: x['rerank_score'], reverse=True)
             results = results[:k]
         return results
                      query: str,
                      k: int = 5,
                      filter_categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
         vector_results = self.search(query, k=k, filter_categories=filter_categories, rerank=False)
         keywords = query.lower().split()
         keyword_scores = {}
         for chunk_id, chunk_data in self.chunks.items():
             chunk = chunk_data
             content = (chunk['title'] + " " + chunk['content']).lower()
             score = sum(content.count(keyword) for keyword in keywords)
             if filter_categories and not any(cat in chunk.get('categories', []) for cat in filter_categories):
                 continue
             keyword_scores[chunk_id] = score
         keyword_results = sorted(
             [{'chunk_id': chunk_id, 'score': score, 'chunk': self.chunks[chunk_id]}
              for chunk_id, score in keyword_scores.items() if score > 0],
             reverse=True
         )[:k]
         seen_ids = set()
         combined_results = []
         for result in vector_results:
             combined_results.append(result)
             seen_ids.add(result['chunk_id'])
         for result in keyword_results:
             if result['chunk_id'] not in seen_ids:
                 combined_results.append(result)
                 seen_ids.add(result['chunk_id'])
         combined_results = combined_results[:k]
         if combined_results:
             pairs = [(query, result['chunk']['content']) for result in combined_results]
             rerank_scores = self.reranker.predict(pairs)
             for i, score in enumerate(rerank_scores):
                 combined_results[i]['rerank_score'] = float(score)
             combined_results = sorted(combined_results, key=lambda x: x['rerank_score'], reverse=True)
+        return combined_results