# Source: oncall-guide-ai / src/retrieval.py
# Author: YanBoChen
# Commit d603ef9: "WIP: Remove obsolete files and implement cloud data loading
# for customization and retrieval systems"
# (file-viewer page residue: raw | history | blame | 16.8 kB)
"""
Basic Retrieval System for OnCall.ai
This module implements the core vector retrieval functionality:
- Basic vector search
- Source marking
- Unified output format
"""
import numpy as np
import json
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import logging
# Set up root logging once at import time: INFO and above, with timestamp,
# logger name and severity in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
class BasicRetrievalSystem:
    """Basic vector retrieval system for medical documents.

    Holds two parallel corpora ("emergency" and "treatment"). For each one
    the instance keeps:
    - a mapping ``chunk_id -> chunk dict`` (``*_chunks``),
    - the pre-computed embedding matrix (``*_embeddings``), whose row ``i``
      is used as item ``i`` when an index is built,
    - an Annoy index using the 'angular' metric (``*_index``).
    """

    # SentenceTransformer checkpoint used to embed queries.
    EMBEDDING_MODEL_NAME = "NeuML/pubmedbert-base-embeddings"

    def __init__(self, embedding_dim: int = 768):
        """
        Initialize the retrieval system
        Args:
            embedding_dim: Dimension of embeddings (default: 768 for PubMedBERT)
        Raises:
            Exception: Whatever the model/data loading steps raise (re-raised
                after being logged).
        """
        self.embedding_dim = embedding_dim
        self.embedding_model = None
        self.emergency_index = None
        self.treatment_index = None
        self.emergency_chunks = {}
        self.treatment_chunks = {}
        # Declared up front so a partially initialised instance has the
        # attributes (as None) instead of raising AttributeError later.
        self.emergency_embeddings = None
        self.treatment_embeddings = None
        # Initialize system
        self._initialize_system()

    def _initialize_system(self) -> None:
        """Load the embedding model, then chunks, embeddings and indices."""
        try:
            logger.info("Initializing retrieval system...")
            # Initialize embedding model
            self.embedding_model = SentenceTransformer(self.EMBEDDING_MODEL_NAME)
            logger.info("Embedding model loaded successfully")
            # Initialize Annoy indices
            self.emergency_index = AnnoyIndex(self.embedding_dim, 'angular')
            self.treatment_index = AnnoyIndex(self.embedding_dim, 'angular')
            # Load data using cloud loader (imported inside each loader)
            self._load_chunks_from_cloud()
            self._load_embeddings_from_cloud()
            self._build_or_load_indices_from_cloud()
            logger.info("Retrieval system initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize retrieval system: {e}")
            raise

    @staticmethod
    def _read_chunk_file(path) -> Dict[int, Any]:
        """Read a JSON chunk file and key each entry by its position."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return dict(enumerate(data))

    @staticmethod
    def _ordered_chunks(chunks) -> List[Any]:
        """Return chunks as a list, accepting either a dict or a sequence."""
        if isinstance(chunks, dict):
            # Dicts built by _read_chunk_file are inserted in id order.
            return list(chunks.values())
        return list(chunks)

    def _load_chunks_from_cloud(self) -> None:
        """Load chunk data from the paths resolved by ``cloud_loader``."""
        try:
            from cloud_loader import cloud_loader
            # Load emergency chunks
            emergency_chunks_path = cloud_loader.get_model_file_path(
                "models/embeddings/emergency_chunks.json"
            )
            self.emergency_chunks = self._read_chunk_file(emergency_chunks_path)
            # Load treatment chunks
            treatment_chunks_path = cloud_loader.get_model_file_path(
                "models/embeddings/treatment_chunks.json"
            )
            self.treatment_chunks = self._read_chunk_file(treatment_chunks_path)
            logger.info(f"Loaded {len(self.emergency_chunks)} emergency and {len(self.treatment_chunks)} treatment chunks")
        except Exception as e:
            logger.error(f"Failed to load chunks: {e}")
            raise

    def _load_embeddings_from_cloud(self) -> None:
        """Load embedding matrices from the paths resolved by ``cloud_loader``."""
        try:
            from cloud_loader import cloud_loader
            # Load emergency embeddings
            emergency_embeddings_path = cloud_loader.get_model_file_path(
                "models/embeddings/emergency_embeddings.npy"
            )
            self.emergency_embeddings = np.load(emergency_embeddings_path)
            # Load treatment embeddings
            treatment_embeddings_path = cloud_loader.get_model_file_path(
                "models/embeddings/treatment_embeddings.npy"
            )
            self.treatment_embeddings = np.load(treatment_embeddings_path)
            logger.info("Embeddings loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load embeddings: {e}")
            raise

    def _build_or_load_indices_from_cloud(self) -> None:
        """Load pre-built Annoy indices from the paths resolved by ``cloud_loader``.

        Despite the name, this method only loads; it never builds an index.
        """
        try:
            from cloud_loader import cloud_loader
            # Load emergency index. str() matches the local loader and keeps
            # the call valid if the resolver hands back a Path object.
            emergency_index_path = cloud_loader.get_model_file_path(
                "models/indices/annoy/emergency.ann"
            )
            self.emergency_index.load(str(emergency_index_path))
            # Load treatment index
            treatment_index_path = cloud_loader.get_model_file_path(
                "models/indices/annoy/treatment.ann"
            )
            self.treatment_index.load(str(treatment_index_path))
            logger.info("Annoy indices loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load indices: {e}")
            raise

    def _load_chunks(self, base_path: Path) -> None:
        """
        Load chunk data from local JSON files
        Args:
            base_path: Directory containing the ``embeddings`` sub-directory
        Raises:
            FileNotFoundError: A chunk file is missing
            json.JSONDecodeError: A chunk file is not valid JSON
        """
        try:
            # Same reader as the cloud path, so both loaders store the same
            # id -> chunk mapping and read the files as UTF-8.
            self.emergency_chunks = self._read_chunk_file(
                base_path / "embeddings" / "emergency_chunks.json"
            )
            self.treatment_chunks = self._read_chunk_file(
                base_path / "embeddings" / "treatment_chunks.json"
            )
            logger.info("Chunks loaded successfully")
        except FileNotFoundError as e:
            logger.error(f"Chunk file not found: {e}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in chunk file: {e}")
            raise

    def _load_embeddings(self, base_path: Path) -> None:
        """
        Load pre-computed embeddings from local ``.npy`` files
        Args:
            base_path: Directory containing the ``embeddings`` sub-directory
        """
        try:
            # Load emergency embeddings
            self.emergency_embeddings = np.load(
                base_path / "embeddings" / "emergency_embeddings.npy"
            )
            # Load treatment embeddings
            self.treatment_embeddings = np.load(
                base_path / "embeddings" / "treatment_embeddings.npy"
            )
            logger.info("Embeddings loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load embeddings: {e}")
            raise

    def _build_or_load_indices(self, base_path: Path) -> None:
        """
        Load each Annoy index from disk, building and saving it if absent
        Args:
            base_path: Directory containing ``indices/annoy``
        """
        indices_path = base_path / "indices" / "annoy"
        emergency_index_path = indices_path / "emergency.ann"
        treatment_index_path = indices_path / "treatment.ann"
        try:
            # Emergency index
            if emergency_index_path.exists():
                self.emergency_index.load(str(emergency_index_path))
                logger.info("Loaded existing emergency index")
            else:
                self._build_index(
                    self.emergency_embeddings,
                    self.emergency_index,
                    emergency_index_path
                )
                logger.info("Built new emergency index")
            # Treatment index
            if treatment_index_path.exists():
                self.treatment_index.load(str(treatment_index_path))
                logger.info("Loaded existing treatment index")
            else:
                self._build_index(
                    self.treatment_embeddings,
                    self.treatment_index,
                    treatment_index_path
                )
                logger.info("Built new treatment index")
        except Exception as e:
            logger.error(f"Failed to build/load indices: {e}")
            raise

    def _build_index(self, embeddings: np.ndarray, index: "AnnoyIndex",
                     save_path: Path, n_trees: int = 15) -> None:
        """
        Build and save Annoy index
        Args:
            embeddings: Embedding vectors; row i becomes item id i
            index: AnnoyIndex instance
            save_path: Path to save the index
            n_trees: Number of trees for Annoy index (default: 15)
        """
        try:
            for i, vec in enumerate(embeddings):
                index.add_item(i, vec)
            index.build(n_trees)
            save_path.parent.mkdir(parents=True, exist_ok=True)
            index.save(str(save_path))
        except Exception as e:
            logger.error(f"Failed to build index: {e}")
            raise

    def search(self, query: str, top_k: int = 5) -> Dict[str, Any]:
        """
        Perform vector search on both indices
        Args:
            query: Search query
            top_k: Number of results to return from each index
        Returns:
            The dict produced by ``post_process_results``: keys ``query``,
            ``processed_results``, ``total_results`` and ``processing_info``
        Raises:
            Exception: Any failure is logged and re-raised
        """
        try:
            # Get query embedding
            query_embedding = self.embedding_model.encode([query])[0]
            # Search both indices
            emergency_results = self._search_index(
                query_embedding,
                self.emergency_index,
                self.emergency_chunks,
                "emergency",
                top_k
            )
            treatment_results = self._search_index(
                query_embedding,
                self.treatment_index,
                self.treatment_chunks,
                "treatment",
                top_k
            )
            # Log individual index results
            logger.info(f"Search results: Emergency={len(emergency_results)}, Treatment={len(treatment_results)}")
            results = {
                "query": query,
                "emergency_results": emergency_results,
                "treatment_results": treatment_results,
                "total_results": len(emergency_results) + len(treatment_results)
            }
            # Post-process results
            return self.post_process_results(results)
        except Exception as e:
            logger.error(f"Search failed: {e}")
            raise

    def _search_index(self, query_embedding: np.ndarray, index: "AnnoyIndex",
                      chunks: Dict, source_type: str, top_k: int) -> List[Dict]:
        """
        Search a single index and format results
        Args:
            query_embedding: Query vector
            index: AnnoyIndex to search
            chunks: Chunk data, looked up by the integer item id from the index
            source_type: Type of source ("emergency" or "treatment")
            top_k: Number of results to return
        Returns:
            List of formatted results
        """
        # Get nearest neighbors
        indices, distances = index.get_nns_by_vector(
            query_embedding, top_k, include_distances=True
        )
        # Format results
        results = []
        for idx, distance in zip(indices, distances):
            # Integer lookup works for an id -> chunk dict and for a plain list.
            chunk_data = chunks[idx]
            results.append({
                "type": source_type,  # Using 'type' to match metadata
                "chunk_id": idx,
                "distance": distance,
                "text": chunk_data.get("text", ""),
                "matched": chunk_data.get("matched", ""),
                "matched_treatment": chunk_data.get("matched_treatment", "")
            })
        return results

    def post_process_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Post-process search results
        - Merge emergency and treatment results
        - Remove duplicates
        - Sort by distance
        Args:
            results: Raw search results with keys ``query``,
                ``emergency_results`` and ``treatment_results``
        Returns:
            Processed results
        """
        try:
            emergency_results = results["emergency_results"]
            treatment_results = results["treatment_results"]
            # Combine all results
            all_results = emergency_results + treatment_results
            # Remove duplicates based on exact text matching
            unique_results = self._remove_duplicates(all_results)
            # Sort by distance (ascending)
            sorted_results = sorted(unique_results, key=lambda x: x["distance"])
            return {
                "query": results["query"],
                "processed_results": sorted_results,
                "total_results": len(sorted_results),
                "processing_info": {
                    "duplicates_removed": len(all_results) - len(unique_results)
                }
            }
        except Exception as e:
            logger.error(f"Post-processing failed: {e}")
            raise

    def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
        """
        Remove duplicate results based on exact text matching
        Args:
            results: List of search results
        Returns:
            Deduplicated results, ordered by ascending distance
        """
        original_count = len(results)
        seen_texts = set()
        unique_results = []
        # Sort results by distance (ascending) to keep best matches
        sorted_results = sorted(results, key=lambda x: x["distance"])
        logger.info(f"Deduplication: Processing {original_count} results using text matching")
        for result in sorted_results:
            text = result["text"]
            if text not in seen_texts:
                seen_texts.add(text)
                unique_results.append(result)
            else:
                logger.debug(f"Skipping duplicate text: {text[:50]}...")
        final_count = len(unique_results)
        # Separator added: the two counts used to be printed fused together.
        logger.info(f"Deduplication summary: {original_count} -> {final_count} results (removed {original_count - final_count})")
        return unique_results

    def search_sliding_window_chunks(self, query: str, top_k: int = 5, window_size: int = 256, overlap: int = 64) -> List[Dict[str, Any]]:
        """
        Perform a brute-force cosine-similarity search over all chunks
        Args:
            query: Search query
            top_k: Number of top results to return
            window_size: Accepted for interface compatibility; currently unused
            overlap: Accepted for interface compatibility; currently unused
        Returns:
            Up to ``top_k`` dicts with keys ``text``, ``distance`` and ``type``,
            best match first. NOTE: ``distance`` holds the cosine *similarity*
            (higher is better), unlike the Annoy distances from ``search``.
            Returns an empty list on any failure (the error is logged).
        """
        try:
            if top_k <= 0:
                return []
            if self.emergency_embeddings is None or self.treatment_embeddings is None:
                raise ValueError("Embeddings are not loaded")
            # Chunks are stored as id -> chunk dicts, which cannot be joined
            # with '+'; flatten each corpus to a list first.
            emergency_list = self._ordered_chunks(self.emergency_chunks)
            treatment_list = self._ordered_chunks(self.treatment_chunks)
            all_chunks = emergency_list + treatment_list
            if not all_chunks:
                return []
            all_embeddings = np.vstack([self.emergency_embeddings, self.treatment_embeddings])
            if all_embeddings.shape[0] != len(all_chunks):
                raise ValueError(
                    f"Embedding rows ({all_embeddings.shape[0]}) do not match "
                    f"chunk count ({len(all_chunks)})"
                )
            # Get query embedding
            query_embedding = np.asarray(self.embedding_model.encode([query])[0])
            # Compute cosine similarities in one vectorized pass
            dots = all_embeddings @ query_embedding
            denom = np.linalg.norm(query_embedding) * np.linalg.norm(all_embeddings, axis=1)
            similarities = np.zeros(len(all_chunks), dtype=float)
            valid = denom > 0  # zero vectors get similarity 0 instead of NaN
            similarities[valid] = dots[valid] / denom[valid]
            # Sort results by similarity (descending)
            sorted_indices = np.argsort(similarities)[::-1]
            # Prepare results
            n_emergency = len(emergency_list)
            results = []
            for idx in sorted_indices[:top_k]:
                chunk = all_chunks[idx]
                results.append({
                    'text': chunk.get('text', ''),
                    # Plain float so the result stays JSON-serialisable.
                    'distance': float(similarities[idx]),
                    'type': 'emergency' if idx < n_emergency else 'treatment'
                })
            logger.info(f"Sliding window search: Found {len(results)} results")
            return results
        except Exception as e:
            logger.error(f"Sliding window search failed: {e}")
            return []

    def search_generic_medical_content(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        Perform generic medical content search
        Args:
            query: Search query
            top_k: Number of top results to return
        Returns:
            List of search results (empty on failure)
        """
        try:
            # re-use search_sliding_window_chunks method
            return self.search_sliding_window_chunks(query, top_k=top_k)
        except Exception as e:
            logger.error(f"Generic medical content search error: {e}")
            return []