# Scraped page header (commit metadata) -- not part of the module:
# baderanas's picture
# Update chroma_operations/retrieve.py
# deca715 verified
import os
import logging
from typing import List, Optional
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv
# Load environment variables at import time (python-dotenv reads a .env file,
# if present) so that the os.getenv("OPENAI_API_KEY") lookup below can see them.
load_dotenv()
# Configure root logging at INFO level and create this module's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def search_similar_chunks(
    query_text: str,
    document_names: List[str],
    collection_name: str = "rag_collection",
    top_k: int = 5,
) -> List[str]:
    """Return up to ``top_k`` chunks similar to ``query_text`` from the given documents.

    The search runs against the persistent Chroma store at ``./chroma_db`` and
    is restricted to chunks whose ``source_file`` metadata value is one of
    ``document_names``. Queries are embedded with OpenAI's
    ``text-embedding-3-small`` model, using the key in ``OPENAI_API_KEY``.

    Args:
        query_text: Text to find similar chunks for.
        document_names: ``source_file`` values to restrict the search to.
        collection_name: Name of the Chroma collection to query.
        top_k: Maximum number of chunks to return.

    Returns:
        The matching chunk texts, in the order Chroma returns them. An empty
        list is returned when there is nothing to search (empty query, no
        document names, non-positive ``top_k``) or when the search fails for
        any reason; failures are logged, never raised.
    """
    # Guard clauses: an "$in" filter over an empty list and a non-positive
    # n_results are rejected by Chroma, and an empty query has nothing to
    # match. Answer these directly instead of logging a spurious error.
    if not query_text or not document_names or top_k < 1:
        return []

    try:
        # Initialize embedding function and Chroma client
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"),
            model_name="text-embedding-3-small",
        )
        client = chromadb.PersistentClient(path="./chroma_db")

        # Load the collection
        collection = client.get_collection(
            name=collection_name,
            embedding_function=embedding_function,
        )

        # Query similar chunks, filtered to the requested source files.
        # list(...) lets callers pass any sequence of names (e.g. a tuple).
        results = collection.query(
            query_texts=[query_text],
            n_results=top_k,
            where={"source_file": {"$in": list(document_names)}},
        )

        # Results are grouped per query text; exactly one query was sent.
        # "or" also covers the key being present but set to None.
        per_query_documents = results.get("documents") or [[]]
        return per_query_documents[0]
    except Exception as e:
        # Deliberate best-effort boundary: callers get an empty result rather
        # than an exception, but the full traceback is kept in the log.
        logger.exception("Similarity search failed: %s", e)
        return []