# Spaces: Running (status text captured with the page; not part of the program)
import logging
import os
from typing import List, Optional

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv

# Load environment variables (OPENAI_API_KEY is read from the environment
# further down in this module).
load_dotenv()

# Setup logging: INFO-level root configuration plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def search_similar_chunks(
    query_text: str,
    document_names: List[str],
    collection_name: str = "rag_collection",
    top_k: int = 5,
    persist_path: str = "./chroma_db",
    embedding_model: str = "text-embedding-3-small",
) -> List[str]:
    """Return the text of the chunks most similar to ``query_text``.

    The search is restricted to chunks whose ``source_file`` metadata value is
    one of ``document_names``.

    Args:
        query_text: Natural-language query to embed and search with.
        document_names: Source-file names to restrict the search to. A single
            string is treated as a one-element list.
        collection_name: Name of the Chroma collection to query.
        top_k: Maximum number of chunks to return; must be a positive integer.
        persist_path: Directory of the persistent Chroma store.
        embedding_model: OpenAI embedding model used to embed the query.

    Returns:
        The matching chunk texts as returned by the collection query. An empty
        list is returned when the inputs cannot produce a result (empty query,
        no document names, non-positive ``top_k``) or when the search fails.
        This function never raises; failures are logged instead.
    """
    # Normalise the document filter: accept a lone string, copy any iterable.
    if isinstance(document_names, str):
        source_files = [document_names]
    else:
        source_files = list(document_names) if document_names else []

    # These inputs can never yield a hit (Chroma rejects an empty ``$in`` list
    # and a non-positive ``n_results``), so skip the client/embedding setup
    # instead of relying on the exception path below.
    if not query_text or not source_files:
        return []
    if not isinstance(top_k, int) or top_k <= 0:
        return []

    try:
        # Initialize embedding function and Chroma client
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name=embedding_model
        )
        client = chromadb.PersistentClient(path=persist_path)

        # Load the collection
        collection = client.get_collection(
            name=collection_name, embedding_function=embedding_function
        )

        # Query similar chunks, filtered by their source_file metadata
        results = collection.query(
            query_texts=[query_text],
            n_results=top_k,
            where={"source_file": {"$in": source_files}},
        )

        # One result list per query text; only one query was sent. Tolerate a
        # missing, None, or empty "documents" entry.
        per_query_documents = results.get("documents") or [[]]
        return per_query_documents[0] or []
    except Exception as e:
        # Best-effort API: log with traceback and report "no results".
        logger.exception("Similarity search failed: %s", e)
        return []