# rag-medical/chroma_operations/delete_chroma.py
# Uploaded by baderanas ("Upload 12 files", commit cdf244e, verified)
import os
import logging
from typing import Optional
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv
import shutil
# Read key/value pairs from a local .env file into the process environment
# (the functions below look up OPENAI_API_KEY via os.getenv).
load_dotenv()

# Configure logging once at import time: INFO and above, with a timestamp,
# the level name and the message on each record.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)
def remove_from_chroma(
    document_name: str,
    collection_name: str = "rag_collection",
    chroma_path: str = "./chroma_db",
) -> bool:
    """
    Remove a document and its chunks from the Chroma vector database.

    Chunks are selected by metadata: every record whose ``source_file`` value
    equals ``document_name`` is deleted.

    Args:
        document_name (str): The base name of the document (without .pdf extension)
        collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")
        chroma_path (str): Directory of the persistent Chroma database (default: "./chroma_db")

    Returns:
        bool: True if the matching chunks were deleted or no chunk matched;
        False if the database directory or the collection is missing, or if
        any error occurs. Errors are logged, never raised.
    """
    # Upper bound on the number of IDs sent per delete call, so that a document
    # with a very large number of chunks does not exceed Chroma's per-call
    # batch limit.
    delete_batch_size = 1000

    try:
        logger.info(f"Attempting to remove document '{document_name}' from Chroma DB")

        # Check if Chroma DB exists before opening a client on the path
        if not os.path.exists(chroma_path):
            logger.warning("Chroma DB directory does not exist")
            return False

        # Initialize embedding function (API key comes from OPENAI_API_KEY)
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )

        # Connect to the persistent client
        client = chromadb.PersistentClient(path=chroma_path)

        # Get the collection; any failure here is reported as "not found"
        try:
            collection = client.get_collection(
                name=collection_name, embedding_function=embedding_function
            )
        except Exception as e:
            logger.error(f"Collection '{collection_name}' not found: {e}")
            return False

        # Delete documents where source_file matches the document_name
        try:
            # First, get the IDs of chunks belonging to this document
            results = collection.get(where={"source_file": document_name})
            ids_to_delete = results.get("ids", [])

            if not ids_to_delete:
                logger.warning(
                    f"No chunks found for document '{document_name}' in collection"
                )
                return True  # Nothing to delete, so consider it successful

            # Delete chunks by IDs, one bounded batch at a time
            for start in range(0, len(ids_to_delete), delete_batch_size):
                collection.delete(ids=ids_to_delete[start : start + delete_batch_size])

            logger.info(
                f"Successfully deleted {len(ids_to_delete)} chunks for '{document_name}' from ChromaDB"
            )
            return True
        except Exception as e:
            logger.error(f"Error deleting chunks from collection: {e}")
            return False
    except Exception as e:
        logger.error(f"Error removing document from Chroma DB: {e}")
        return False
def delete_all_from_chroma(
    collection_name: str = "rag_collection",
    chroma_path: str = "./chroma_db",
) -> bool:
    """
    Delete all documents from the specified ChromaDB collection.

    The collection itself is kept; only its records are removed. Records are
    fetched and deleted by ID in bounded batches, because Chroma's
    ``Collection.delete`` requires at least one of ``ids``, ``where`` or
    ``where_document`` and rejects a call with no arguments.

    Args:
        collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")
        chroma_path (str): Directory of the persistent Chroma database (default: "./chroma_db")

    Returns:
        bool: True if the collection was emptied (or was already empty);
        False if the database directory or the collection is missing, or if
        any error occurs. Errors are logged, never raised.
    """
    # Number of records fetched and deleted per round trip; keeps both memory
    # use and the size of each delete call bounded.
    fetch_batch_size = 1000

    try:
        # Report a missing database instead of opening a client on the path
        # (the persistent client may create the directory as a side effect).
        if not os.path.exists(chroma_path):
            logger.warning("Chroma DB directory does not exist")
            return False

        # Initialize embedding function (API key comes from OPENAI_API_KEY)
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )

        # Connect to the persistent client
        client = chromadb.PersistentClient(path=chroma_path)

        try:
            # Get the collection
            collection = client.get_collection(
                name=collection_name, embedding_function=embedding_function
            )

            # Delete all documents in the collection. The loop is bounded by
            # the initial record count so it always terminates, even if a
            # delete call turns out to remove nothing.
            total = collection.count()
            removed = 0
            while removed < total:
                batch_ids = collection.get(limit=fetch_batch_size).get("ids", [])
                if not batch_ids:
                    break
                collection.delete(ids=batch_ids)
                removed += len(batch_ids)

            logger.info(
                f"Successfully deleted all documents from collection '{collection_name}'"
            )
            return True
        except Exception as e:
            logger.error(f"Error accessing or deleting collection: {e}")
            return False
    except Exception as e:
        logger.error(f"Error connecting to Chroma DB: {e}")
        return False
def reset_chroma_db(chroma_path: str = "./chroma_db") -> bool:
    """
    Reset the entire Chroma database by deleting and recreating the directory.

    Args:
        chroma_path (str): Directory of the persistent Chroma database (default: "./chroma_db")

    Returns:
        bool: True if the directory existed and was replaced by an empty one;
        False if it did not exist (nothing is created in that case) or if
        any error occurs. Errors are logged, never raised.
    """
    try:
        # Nothing to reset: report it rather than creating a new directory
        if not os.path.exists(chroma_path):
            logger.warning("Chroma DB directory does not exist")
            return False

        # Delete the entire Chroma directory
        shutil.rmtree(chroma_path)
        logger.info("Successfully deleted entire Chroma DB directory")

        # Create an empty directory
        os.makedirs(chroma_path, exist_ok=True)
        logger.info("Created fresh Chroma DB directory")
        return True
    except Exception as e:
        logger.error(f"Error resetting Chroma DB: {e}")
        return False