Spaces:
Running
Running
import os | |
import logging | |
from typing import Optional | |
import chromadb | |
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction | |
from dotenv import load_dotenv | |
import shutil | |
# Load environment variables before anything reads them
# (OPENAI_API_KEY is fetched with os.getenv further down).
load_dotenv()

# Configure logging once for the module: INFO and above, timestamped lines.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def remove_from_chroma(
    document_name: str, collection_name: str = "rag_collection"
) -> bool:
    """
    Delete every stored chunk of a single document from the Chroma vector store.

    Chunks are selected by their ``source_file`` metadata value, which must
    equal ``document_name``.

    Args:
        document_name (str): Base name of the document (without .pdf extension)
        collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")

    Returns:
        bool: True if the chunks were deleted or no chunk matched; False if the
        database directory or the collection is missing, or if any step raised.
    """
    try:
        logger.info(f"Attempting to remove document '{document_name}' from Chroma DB")

        # Bail out early when there is no on-disk database to open.
        db_dir = "./chroma_db"
        if not os.path.exists(db_dir):
            logger.warning("Chroma DB directory does not exist")
            return False

        # The collection is opened with the OpenAI embedding function.
        embedder = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )
        db_client = chromadb.PersistentClient(path=db_dir)

        # Any failure while opening the collection is reported as "not found".
        try:
            target = db_client.get_collection(
                name=collection_name, embedding_function=embedder
            )
        except Exception as lookup_error:
            logger.error(f"Collection '{collection_name}' not found: {lookup_error}")
            return False

        try:
            # Look up the IDs of the chunks tagged with this document name.
            matches = target.get(where={"source_file": document_name})
            chunk_ids = matches.get("ids", [])

            if chunk_ids:
                target.delete(ids=chunk_ids)
                logger.info(
                    f"Successfully deleted {len(chunk_ids)} chunks for '{document_name}' from ChromaDB"
                )
            else:
                # Nothing matched: there is nothing to delete, which still counts as success.
                logger.warning(
                    f"No chunks found for document '{document_name}' in collection"
                )
            return True
        except Exception as delete_error:
            logger.error(f"Error deleting chunks from collection: {delete_error}")
            return False
    except Exception as unexpected_error:
        logger.error(f"Error removing document from Chroma DB: {unexpected_error}")
        return False
def delete_all_from_chroma(collection_name: str = "rag_collection") -> bool:
    """
    Delete all documents from the specified ChromaDB collection.

    The collection itself is kept; only its records are removed. Record IDs
    are fetched first and then deleted in batches, because calling
    ``collection.delete()`` with no ``ids``/``where``/``where_document`` is
    rejected with a ValueError by the Chroma releases that provide
    ``PersistentClient``.

    Args:
        collection_name (str): Name of the collection in ChromaDB (default: "rag_collection")

    Returns:
        bool: True if the collection was emptied (or was already empty);
        False if the database directory or collection is missing, or if any
        step raised.
    """
    chroma_path = "./chroma_db"
    # Conservative chunk size so a single delete call stays well below the
    # backend's per-request batch limit (the exact limit depends on the install).
    batch_size = 1000

    try:
        # Same guard as remove_from_chroma: do not open a client against a
        # path that does not exist.
        if not os.path.exists(chroma_path):
            logger.warning("Chroma DB directory does not exist")
            return False

        # Initialize embedding function and Chroma client
        embedding_function = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
        )
        client = chromadb.PersistentClient(path=chroma_path)

        try:
            collection = client.get_collection(
                name=collection_name, embedding_function=embedding_function
            )

            # include=[] asks for IDs only, so documents and metadata are not loaded.
            all_ids = collection.get(include=[]).get("ids") or []

            # An empty collection yields no batches; delete(ids=[]) is never called.
            for start in range(0, len(all_ids), batch_size):
                collection.delete(ids=all_ids[start:start + batch_size])

            logger.info(
                f"Successfully deleted all documents from collection '{collection_name}'"
            )
            return True
        except Exception as e:
            logger.error(f"Error accessing or deleting collection: {e}")
            return False
    except Exception as e:
        logger.error(f"Error connecting to Chroma DB: {e}")
        return False
def reset_chroma_db() -> bool:
    """
    Wipe the Chroma database by removing its directory and creating an empty one.

    Returns:
        bool: True if the directory existed and was replaced by an empty one;
        False if it did not exist or if removal/creation raised.
    """
    db_dir = "./chroma_db"
    try:
        # Guard clause: a missing directory is reported as a failure.
        if not os.path.exists(db_dir):
            logger.warning("Chroma DB directory does not exist")
            return False

        # Remove the directory tree with everything stored in it.
        shutil.rmtree(db_dir)
        logger.info("Successfully deleted entire Chroma DB directory")

        # Put an empty directory back in its place.
        os.makedirs(db_dir, exist_ok=True)
        logger.info("Created fresh Chroma DB directory")
        return True
    except Exception as reset_error:
        logger.error(f"Error resetting Chroma DB: {reset_error}")
        return False