Spaces:

Zwounds
/

LibraryRAG

Sleeping

App Files Community

Zwounds committed on Apr 1

Commit

01afcca

verified ·

1 Parent(s): d93b2e5

Upload app.py

Browse files

Files changed (1) hide show

app.py +27 -17

app.py CHANGED Viewed

@@ -12,8 +12,8 @@ from tqdm import tqdm
 from datasets import load_dataset
 import pandas as pd
 from sentence_transformers import SentenceTransformer
-# Import config if needed for EphemeralClient settings, though default might be fine
-import chromadb.config
 # --- Page Config (MUST BE FIRST Streamlit call) ---
 st.set_page_config(layout="wide")
@@ -25,7 +25,7 @@ LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
 HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
 HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
 PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
-ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
 TOP_K = 10
 INITIAL_N_RESULTS = 50
 MAX_NEW_TOKENS = 512
@@ -129,12 +129,18 @@ generation_client = initialize_hf_client()
 embedding_model = load_local_embedding_model()
 # ---
-# --- Setup ChromaDB Collection (using Session State) ---
-# This function now attempts to load or create the collection and stores it in session state
 def setup_chroma_collection():
     if 'chroma_collection' in st.session_state and st.session_state.chroma_collection is not None:
-        logging.info("Using existing Chroma collection from session state.")
-        return st.session_state.chroma_collection
     # Proceed with setup only if essential components are loaded
     if not embedding_model or not generation_client:
@@ -147,17 +153,23 @@ def setup_chroma_collection():
             st.error("Failed to load embedding data. Cannot initialize vector database.")
             return None
         try:
-            logging.info("Initializing Ephemeral ChromaDB client...")
-            # Use EphemeralClient explicitly
-            chroma_client = chromadb.EphemeralClient(
-                settings=chromadb.config.Settings(
-                    anonymized_telemetry=False, # Optional: Disable telemetry
-                    allow_reset=True # Optional: Allows resetting
-                )
             )
-            # Check if collection exists and delete if it does (robustness)
             try:
                 existing_collections = [col.name for col in chroma_client.list_collections()]
                 if COLLECTION_NAME in existing_collections:
@@ -166,7 +178,6 @@ def setup_chroma_collection():
             except Exception as delete_e:
                  logging.warning(f"Could not check/delete existing collection (might be okay): {delete_e}")
             logging.info(f"Creating collection: {COLLECTION_NAME}")
             collection_instance = chroma_client.create_collection(
                 name=COLLECTION_NAME,
@@ -234,7 +245,6 @@ def setup_chroma_collection():
             return None
 # --- Initialize collection ---
-# Call the setup function which populates session state if needed
 collection = setup_chroma_collection()
 # ---

 from datasets import load_dataset
 import pandas as pd
 from sentence_transformers import SentenceTransformer
+import tempfile # Added for temporary directory
+import chromadb.config # Added for Settings
 # --- Page Config (MUST BE FIRST Streamlit call) ---
 st.set_page_config(layout="wide")
 HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
 HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
 PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
+ADD_BATCH_SIZE = 500 # Batch size for adding to Chroma
 TOP_K = 10
 INITIAL_N_RESULTS = 50
 MAX_NEW_TOKENS = 512
 embedding_model = load_local_embedding_model()
 # ---
+# --- Setup ChromaDB Collection (using Session State and Temp Dir) ---
 def setup_chroma_collection():
+    """Loads data from HF, sets up ChromaDB in a temp dir, populates it, and returns the collection."""
     if 'chroma_collection' in st.session_state and st.session_state.chroma_collection is not None:
+        # Basic check: see if collection is queryable
+        try:
+            st.session_state.chroma_collection.peek(1) # Try a lightweight operation
+            logging.info("Using existing Chroma collection from session state.")
+            return st.session_state.chroma_collection
+        except Exception as e:
+            logging.warning(f"Error accessing existing collection in session state ({e}), re-initializing.")
+            st.session_state.chroma_collection = None # Force re-init
     # Proceed with setup only if essential components are loaded
     if not embedding_model or not generation_client:
             st.error("Failed to load embedding data. Cannot initialize vector database.")
             return None
+        # Create a temporary directory for this session
+        # Note: This directory might be cleaned up automatically depending on the OS/environment
+        # In HF Spaces ephemeral storage, it will likely be wiped on restart anyway.
+        temp_dir = tempfile.mkdtemp()
+        logging.info(f"Created temporary directory for ChromaDB: {temp_dir}")
         try:
+            logging.info("Initializing ChromaDB client with temporary storage...")
+            settings = chromadb.config.Settings(
+                persist_directory=temp_dir,
+                anonymized_telemetry=False,
+                is_persistent=True # Explicitly set for PersistentClient behavior in temp dir
             )
+            # Use the standard Client, but point it to the temp directory
+            chroma_client = chromadb.Client(settings=settings)
+            # Check if collection exists and delete if it does
             try:
                 existing_collections = [col.name for col in chroma_client.list_collections()]
                 if COLLECTION_NAME in existing_collections:
             except Exception as delete_e:
                  logging.warning(f"Could not check/delete existing collection (might be okay): {delete_e}")
             logging.info(f"Creating collection: {COLLECTION_NAME}")
             collection_instance = chroma_client.create_collection(
                 name=COLLECTION_NAME,
             return None
 # --- Initialize collection ---
 collection = setup_chroma_collection()
 # ---