import os
import time
import logging
from typing import Optional, List, Dict, Any

from dotenv import load_dotenv
from pinecone import Pinecone
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Field
# Configure logging
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Pinecone API key and index name
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Pinecone retrieval configuration
DEFAULT_LIMIT_K = int(os.getenv("PINECONE_DEFAULT_LIMIT_K", "10"))
DEFAULT_TOP_K = int(os.getenv("PINECONE_DEFAULT_TOP_K", "6"))
DEFAULT_SIMILARITY_METRIC = os.getenv("PINECONE_DEFAULT_SIMILARITY_METRIC", "cosine")
DEFAULT_SIMILARITY_THRESHOLD = float(os.getenv("PINECONE_DEFAULT_SIMILARITY_THRESHOLD", "0.75"))
ALLOWED_METRICS = os.getenv("PINECONE_ALLOWED_METRICS", "cosine,dotproduct,euclidean").split(",")
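
# Example .env for the variables read above (illustrative values only; supply
# your own credentials and tuning):
#
#   PINECONE_API_KEY=pc-xxxxxxxx
#   PINECONE_INDEX_NAME=my-index
#   GOOGLE_API_KEY=AIza-xxxxxxxx
#   PINECONE_DEFAULT_TOP_K=6
#   PINECONE_DEFAULT_LIMIT_K=10
#   PINECONE_DEFAULT_SIMILARITY_METRIC=cosine
#   PINECONE_DEFAULT_SIMILARITY_THRESHOLD=0.75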
# Export constants for importing elsewhere
__all__ = [
    'get_pinecone_index',
    'check_db_connection',
    'search_vectors',
    'upsert_vectors',
    'delete_vectors',
    'fetch_metadata',
    'get_chain',
    'DEFAULT_TOP_K',
    'DEFAULT_LIMIT_K',
    'DEFAULT_SIMILARITY_METRIC',
    'DEFAULT_SIMILARITY_THRESHOLD',
    'ALLOWED_METRICS',
    'ThresholdRetriever'
]
# Configure Google API
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)

# Initialize global variables to store instances of Pinecone client, index, and retriever
pc = None
index = None
_retriever_instance = None

# Check environment variables
if not PINECONE_API_KEY:
    logger.error("PINECONE_API_KEY is not set in environment variables")
if not PINECONE_INDEX_NAME:
    logger.error("PINECONE_INDEX_NAME is not set in environment variables")
def init_pinecone(): | |
"""Initialize pinecone connection using new API""" | |
global pc, index | |
try: | |
# Only initialize if not already initialized | |
if pc is None: | |
logger.info(f"Initializing Pinecone connection to index {PINECONE_INDEX_NAME}...") | |
# Check if API key and index name are set | |
if not PINECONE_API_KEY: | |
logger.error("PINECONE_API_KEY is not set in environment variables") | |
return None | |
if not PINECONE_INDEX_NAME: | |
logger.error("PINECONE_INDEX_NAME is not set in environment variables") | |
return None | |
# Initialize Pinecone client using the new API | |
pc = Pinecone(api_key=PINECONE_API_KEY) | |
try: | |
# Check if index exists | |
index_list = pc.list_indexes() | |
if not hasattr(index_list, 'names') or PINECONE_INDEX_NAME not in index_list.names(): | |
logger.error(f"Index {PINECONE_INDEX_NAME} does not exist in Pinecone") | |
return None | |
# Get existing index | |
index = pc.Index(PINECONE_INDEX_NAME) | |
logger.info(f"Pinecone connection established to index {PINECONE_INDEX_NAME}") | |
except Exception as connection_error: | |
logger.error(f"Error connecting to Pinecone index: {connection_error}") | |
return None | |
return index | |
except ImportError as e: | |
logger.error(f"Required package for Pinecone is missing: {e}") | |
return None | |
except Exception as e: | |
logger.error(f"Unexpected error initializing Pinecone: {e}") | |
return None | |
# Get Pinecone index singleton
def get_pinecone_index():
    """Get the Pinecone index, initializing the connection on first use"""
    global index
    if index is None:
        index = init_pinecone()
    return index
# Check Pinecone connection
def check_db_connection():
    """Check the Pinecone connection by reading index statistics"""
    try:
        pinecone_index = get_pinecone_index()
        if pinecone_index is None:
            return False
        # Describe the index to confirm the connection is working
        stats = pinecone_index.describe_index_stats()
        # Read the total vector count from the response object
        total_vectors = getattr(stats, 'total_vector_count', 0)
        if getattr(stats, 'namespaces', None):
            # If namespaces are reported, sum their per-namespace vector counts
            total_vectors = sum(getattr(ns, 'vector_count', 0) for ns in stats.namespaces.values())
        logger.info(f"Pinecone connection is working. Total vectors: {total_vectors}")
        return True
    except Exception as e:
        logger.error(f"Error in Pinecone connection: {e}")
        return False
# Convert similarity score based on the metric
def convert_score(score: float, metric: str) -> float:
    """
    Convert a similarity score to a 0-1 scale based on the metric used.

    For distance metrics like euclidean, where lower is better, the score
    is inverted so that higher always means more similar.

    Args:
        score: The raw similarity score
        metric: The similarity metric used

    Returns:
        A normalized score between 0 and 1 where higher means more similar
    """
    if metric.lower() in ["euclidean", "l2"]:
        # For distance metrics (lower is better), invert and normalize,
        # assuming a maximum reasonable distance of 2.0 for normalized vectors
        return max(0, 1 - (score / 2.0))
    else:
        # For cosine and dot product (higher is better), return as is
        return score
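
# Illustrative examples (assuming unit-normalized vectors, per the note above):
#   convert_score(0.83, "cosine")    -> 0.83  (higher-is-better, passed through)
#   convert_score(0.50, "euclidean") -> 0.75  (1 - 0.50 / 2.0, inverted so higher is better)
#   convert_score(2.40, "euclidean") -> 0.0   (clamped at 0 for distances beyond 2.0)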
# Filter results based on similarity threshold
def filter_by_threshold(results, threshold: float, metric: str) -> List[Dict]:
    """
    Filter query results based on a similarity threshold.

    Args:
        results: The query results from Pinecone
        threshold: The similarity threshold (0-1)
        metric: The similarity metric used

    Returns:
        Filtered list of matches
    """
    filtered_matches = []
    if not hasattr(results, 'matches'):
        return filtered_matches
    for match in results.matches:
        # Normalize the raw score so it is comparable across metrics
        score = getattr(match, 'score', 0)
        normalized_score = convert_score(score, metric)
        # Keep only matches at or above the threshold
        if normalized_score >= threshold:
            # Attach the normalized score for downstream consumers
            match.normalized_score = normalized_score
            filtered_matches.append(match)
    return filtered_matches
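
# Note: filter_by_threshold is meant to run over a candidate pool of limit_k
# matches (limit_k >= top_k), so that after low-similarity hits are dropped
# there are still up to top_k results left. For example, with limit_k=10,
# top_k=6, and threshold=0.75, ten candidates are fetched, any scoring below
# 0.75 are discarded, and at most six survivors are returned by search_vectors.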
# Search vectors in Pinecone with advanced options
async def search_vectors(
    query_vector,
    top_k: int = DEFAULT_TOP_K,
    limit_k: int = DEFAULT_LIMIT_K,
    similarity_metric: str = DEFAULT_SIMILARITY_METRIC,
    similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
    namespace: str = "Default",
    filter: Optional[Dict] = None
) -> Dict:
    """
    Search for the most similar vectors in Pinecone with threshold filtering.

    Note that Pinecone fixes the distance metric at index creation time, so
    similarity_metric here only controls how raw scores are normalized for
    threshold filtering; it does not change how the index computes scores.

    Args:
        query_vector: The query vector
        top_k: Number of results to return (after threshold filtering)
        limit_k: Maximum number of results to retrieve from Pinecone
        similarity_metric: Metric for score normalization (cosine, dotproduct, euclidean)
        similarity_threshold: Threshold for similarity (0-1)
        namespace: Namespace to search in
        filter: Optional metadata filter for the query

    Returns:
        Search results with matches filtered by threshold, or None on error
    """
    try:
        # Validate parameters
        if similarity_metric not in ALLOWED_METRICS:
            logger.warning(f"Invalid similarity metric: {similarity_metric}. Using default: {DEFAULT_SIMILARITY_METRIC}")
            similarity_metric = DEFAULT_SIMILARITY_METRIC
        if limit_k < top_k:
            logger.warning(f"limit_k ({limit_k}) must be greater than or equal to top_k ({top_k}). Setting limit_k to {top_k}")
            limit_k = top_k

        pinecone_index = get_pinecone_index()
        if pinecone_index is None:
            logger.error("Failed to get Pinecone index for search")
            return None

        # Query with limit_k candidates so threshold filtering still leaves
        # enough results. The query API takes no metric argument; the metric
        # is a property of the index itself.
        results = pinecone_index.query(
            vector=query_vector,
            top_k=limit_k,
            namespace=namespace,
            filter=filter,
            include_metadata=True,
            include_values=False  # Skip vector values to save bandwidth
        )

        # Filter by threshold, then keep at most top_k matches
        filtered_matches = filter_by_threshold(results, similarity_threshold, similarity_metric)
        results.matches = filtered_matches[:top_k]

        # Log search result metrics
        logger.info(f"Pinecone search returned {len(results.matches)} matches after threshold filtering (metric: {similarity_metric}, threshold: {similarity_threshold}, namespace: {namespace})")
        return results
    except Exception as e:
        logger.error(f"Error searching vectors: {e}")
        return None
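
# Minimal usage sketch (the query string is hypothetical; any query vector
# produced by the configured embedding model works the same way):
#
#   embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
#   query_vector = embeddings.embed_query("How do I reset my password?")
#   results = await search_vectors(query_vector, top_k=3, namespace="Default")
#   if results:
#       for match in results.matches:
#           print(match.id, match.normalized_score)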
# Upsert vectors to Pinecone
async def upsert_vectors(vectors, namespace="Default"):
    """Upsert vectors to the Pinecone index"""
    try:
        pinecone_index = get_pinecone_index()
        if pinecone_index is None:
            logger.error("Failed to get Pinecone index for upsert")
            return None
        response = pinecone_index.upsert(
            vectors=vectors,
            namespace=namespace
        )
        # Log upsert metrics (the response object exposes upserted_count)
        upserted_count = getattr(response, 'upserted_count', 0)
        logger.info(f"Upserted {upserted_count} vectors to Pinecone")
        return response
    except Exception as e:
        logger.error(f"Error upserting vectors: {e}")
        return None
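
# Expected vector payload shape (hypothetical IDs and truncated values, for
# illustration only):
#
#   await upsert_vectors([
#       {"id": "doc-1", "values": [0.1, 0.2, ...], "metadata": {"text": "chunk one"}},
#       {"id": "doc-2", "values": [0.3, 0.4, ...], "metadata": {"text": "chunk two"}},
#   ], namespace="Default")
#
# The "text" metadata key matters: the retriever below promotes it to
# Document.page_content.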
# Delete vectors from Pinecone
async def delete_vectors(ids, namespace="Default"):
    """Delete vectors from the Pinecone index"""
    try:
        pinecone_index = get_pinecone_index()
        if pinecone_index is None:
            logger.error("Failed to get Pinecone index for delete")
            return False
        pinecone_index.delete(
            ids=ids,
            namespace=namespace
        )
        logger.info(f"Deleted vectors with IDs {ids} from Pinecone")
        return True
    except Exception as e:
        logger.error(f"Error deleting vectors: {e}")
        return False
# Fetch vector metadata from Pinecone
async def fetch_metadata(ids, namespace="Default"):
    """Fetch metadata for specific vector IDs"""
    try:
        pinecone_index = get_pinecone_index()
        if pinecone_index is None:
            logger.error("Failed to get Pinecone index for fetch")
            return None
        response = pinecone_index.fetch(
            ids=ids,
            namespace=namespace
        )
        return response
    except Exception as e:
        logger.error(f"Error fetching vector metadata: {e}")
        return None
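
# Usage sketch (hypothetical IDs): the fetch response maps each found ID to
# its vector record, whose metadata carries the stored fields.
#
#   response = await fetch_metadata(["doc-1", "doc-2"])
#   if response:
#       for vec_id, vector in response.vectors.items():
#           print(vec_id, vector.metadata)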
# Custom retriever class for LangChain integration
class ThresholdRetriever(BaseRetriever):
    """
    Custom retriever that supports threshold-based filtering and multiple
    similarity metrics, integrating the advanced retrieval logic above with
    the LangChain ecosystem.
    """
    vectorstore: Any = Field(description="Vector store to use for retrieval")
    embeddings: Any = Field(description="Embeddings model to use for retrieval")
    search_kwargs: Dict[str, Any] = Field(default_factory=dict, description="Search kwargs for the vectorstore")
    top_k: int = Field(default=DEFAULT_TOP_K, description="Number of results to return after filtering")
    limit_k: int = Field(default=DEFAULT_LIMIT_K, description="Maximum number of results to retrieve from Pinecone")
    similarity_metric: str = Field(default=DEFAULT_SIMILARITY_METRIC, description="Similarity metric to use")
    similarity_threshold: float = Field(default=DEFAULT_SIMILARITY_THRESHOLD, description="Threshold for similarity")
    namespace: str = Field(default="Default", description="Pinecone namespace to search in")

    class Config:
        """Configuration for this pydantic object."""
        arbitrary_types_allowed = True
    def search_vectors_sync(
        self,
        query_vector,
        top_k: int = DEFAULT_TOP_K,
        limit_k: int = DEFAULT_LIMIT_K,
        similarity_metric: str = DEFAULT_SIMILARITY_METRIC,
        similarity_threshold: float = DEFAULT_SIMILARITY_THRESHOLD,
        namespace: str = "Default",
        filter: Optional[Dict] = None
    ) -> Dict:
        """Synchronous wrapper for search_vectors"""
        import asyncio
        from concurrent.futures import ThreadPoolExecutor

        coro = search_vectors(
            query_vector=query_vector,
            top_k=top_k,
            limit_k=limit_k,
            similarity_metric=similarity_metric,
            similarity_threshold=similarity_threshold,
            namespace=namespace,
            filter=filter
        )
        try:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = None
            if loop is not None:
                # Already inside an event loop (e.g. FastAPI): run the
                # coroutine in a dedicated thread with its own loop
                with ThreadPoolExecutor(max_workers=1) as executor:
                    return executor.submit(asyncio.run, coro).result()
            # No running loop: drive the coroutine directly
            return asyncio.run(coro)
        except Exception as e:
            logger.error(f"Error in search_vectors_sync: {e}")
            return None
    def _get_relevant_documents(
        self, query: str, *, run_manager: Optional[CallbackManagerForRetrieverRun] = None
    ) -> List[Document]:
        """
        Get documents relevant to the query using threshold-based retrieval.

        Args:
            query: The query string
            run_manager: The callback manager for this retriever run

        Returns:
            List of relevant documents
        """
        # Generate the query embedding with the configured embeddings model
        try:
            embedding = self.embeddings.embed_query(query)
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            # Fall back to a fresh embeddings model if the stored one fails
            embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
            embedding = embedding_model.embed_query(query)

        # Run the async search without calling asyncio.run() inside a live loop
        import asyncio
        from concurrent.futures import ThreadPoolExecutor

        coro = search_vectors(
            query_vector=embedding,
            top_k=self.top_k,
            limit_k=self.limit_k,
            similarity_metric=self.similarity_metric,
            similarity_threshold=self.similarity_threshold,
            namespace=self.namespace,
            # filter=self.search_kwargs.get("filter", None)
        )
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None
        if loop is not None:
            # We're inside an existing event loop (e.g. FastAPI), so run the
            # coroutine in a separate thread that owns its own event loop
            with ThreadPoolExecutor(max_workers=1) as executor:
                search_result = executor.submit(asyncio.run, coro).result()
        else:
            # No event loop running, so we can drive the coroutine directly
            search_result = asyncio.run(coro)
        # Convert matches to LangChain Documents
        documents = []
        if search_result and hasattr(search_result, 'matches'):
            for match in search_result.matches:
                # Copy metadata into a plain dict so it can be mutated safely
                metadata = dict(getattr(match, 'metadata', None) or {})
                # Record both the raw and normalized scores
                score = getattr(match, 'score', 0)
                metadata['score'] = score
                metadata['normalized_score'] = getattr(match, 'normalized_score', score)
                # The stored text becomes the document content, not metadata
                text = metadata.pop('text', '')
                documents.append(Document(page_content=text, metadata=metadata))
        return documents
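
# Usage sketch for the retriever (assumes a reachable Pinecone index; the
# usual construction path is get_chain below, and the query is hypothetical):
#
#   retriever = get_chain(namespace="Default", top_k=4)
#   docs = retriever.get_relevant_documents("What is our refund policy?")
#   for doc in docs:
#       print(doc.metadata["normalized_score"], doc.page_content[:80])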
# Get the retrieval chain with Pinecone vector store
def get_chain(
    index_name=PINECONE_INDEX_NAME,
    namespace="Default",
    top_k=DEFAULT_TOP_K,
    limit_k=DEFAULT_LIMIT_K,
    similarity_metric=DEFAULT_SIMILARITY_METRIC,
    similarity_threshold=DEFAULT_SIMILARITY_THRESHOLD
):
    """
    Get the retrieval chain with Pinecone vector store using threshold-based retrieval.

    Args:
        index_name: Pinecone index name
        namespace: Pinecone namespace
        top_k: Number of results to return after filtering
        limit_k: Maximum number of results to retrieve from Pinecone
        similarity_metric: Similarity metric to use (cosine, dotproduct, euclidean)
        similarity_threshold: Threshold for similarity (0-1)

    Returns:
        ThresholdRetriever instance, or None on error
    """
    global _retriever_instance
    try:
        # Return the cached instance if one exists. Note: the cache is not
        # keyed on the arguments, so later calls with different parameters
        # still receive the first retriever that was built.
        if _retriever_instance is not None:
            return _retriever_instance

        start_time = time.time()
        logger.info("Initializing new retriever chain with threshold-based filtering")

        # Initialize embeddings model
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

        # Get index
        pinecone_index = get_pinecone_index()
        if not pinecone_index:
            logger.error("Failed to get Pinecone index for retriever chain")
            return None

        # Get statistics for logging
        try:
            stats = pinecone_index.describe_index_stats()
            total_vectors = getattr(stats, 'total_vector_count', 0)
            logger.info(f"Pinecone index stats - Total vectors: {total_vectors}")
        except Exception as e:
            logger.error(f"Error getting index stats: {e}")

        # Use the Pinecone vector store from langchain_community
        from langchain_community.vectorstores import Pinecone as LangchainPinecone
        logger.info(f"Creating Pinecone vectorstore with index: {index_name}, namespace: {namespace}")
        vectorstore = LangchainPinecone.from_existing_index(
            embedding=embeddings,
            index_name=index_name,
            namespace=namespace,
            text_key="text"
        )

        # Create the threshold-based retriever with both vectorstore and embeddings
        logger.info(f"Creating ThresholdRetriever with top_k={top_k}, limit_k={limit_k}, "
                    f"metric={similarity_metric}, threshold={similarity_threshold}")
        _retriever_instance = ThresholdRetriever(
            vectorstore=vectorstore,
            embeddings=embeddings,
            top_k=top_k,
            limit_k=limit_k,
            similarity_metric=similarity_metric,
            similarity_threshold=similarity_threshold,
            namespace=namespace
        )
        logger.info(f"Pinecone retriever initialized in {time.time() - start_time:.2f} seconds")
        return _retriever_instance
    except Exception as e:
        logger.error(f"Error creating retrieval chain: {e}")
        return None