Zwounds committed on
Commit
01afcca
·
verified ·
1 Parent(s): d93b2e5

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -17
app.py CHANGED
@@ -12,8 +12,8 @@ from tqdm import tqdm
12
  from datasets import load_dataset
13
  import pandas as pd
14
  from sentence_transformers import SentenceTransformer
15
- # Import config if needed for EphemeralClient settings, though default might be fine
16
- import chromadb.config
17
 
18
  # --- Page Config (MUST BE FIRST Streamlit call) ---
19
  st.set_page_config(layout="wide")
@@ -25,7 +25,7 @@ LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
25
  HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
26
  HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
27
  PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
28
- ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
29
  TOP_K = 10
30
  INITIAL_N_RESULTS = 50
31
  MAX_NEW_TOKENS = 512
@@ -129,12 +129,18 @@ generation_client = initialize_hf_client()
129
  embedding_model = load_local_embedding_model()
130
  # ---
131
 
132
- # --- Setup ChromaDB Collection (using Session State) ---
133
- # This function now attempts to load or create the collection and stores it in session state
134
  def setup_chroma_collection():
 
135
  if 'chroma_collection' in st.session_state and st.session_state.chroma_collection is not None:
136
- logging.info("Using existing Chroma collection from session state.")
137
- return st.session_state.chroma_collection
 
 
 
 
 
 
138
 
139
  # Proceed with setup only if essential components are loaded
140
  if not embedding_model or not generation_client:
@@ -147,17 +153,23 @@ def setup_chroma_collection():
147
  st.error("Failed to load embedding data. Cannot initialize vector database.")
148
  return None
149
 
 
 
 
 
 
 
150
  try:
151
- logging.info("Initializing Ephemeral ChromaDB client...")
152
- # Use EphemeralClient explicitly
153
- chroma_client = chromadb.EphemeralClient(
154
- settings=chromadb.config.Settings(
155
- anonymized_telemetry=False, # Optional: Disable telemetry
156
- allow_reset=True # Optional: Allows resetting
157
- )
158
  )
 
 
159
 
160
- # Check if collection exists and delete if it does (robustness)
161
  try:
162
  existing_collections = [col.name for col in chroma_client.list_collections()]
163
  if COLLECTION_NAME in existing_collections:
@@ -166,7 +178,6 @@ def setup_chroma_collection():
166
  except Exception as delete_e:
167
  logging.warning(f"Could not check/delete existing collection (might be okay): {delete_e}")
168
 
169
-
170
  logging.info(f"Creating collection: {COLLECTION_NAME}")
171
  collection_instance = chroma_client.create_collection(
172
  name=COLLECTION_NAME,
@@ -234,7 +245,6 @@ def setup_chroma_collection():
234
  return None
235
 
236
  # --- Initialize collection ---
237
- # Call the setup function which populates session state if needed
238
  collection = setup_chroma_collection()
239
  # ---
240
 
 
12
  from datasets import load_dataset
13
  import pandas as pd
14
  from sentence_transformers import SentenceTransformer
15
+ import tempfile # Added for temporary directory
16
+ import chromadb.config # Added for Settings
17
 
18
  # --- Page Config (MUST BE FIRST Streamlit call) ---
19
  st.set_page_config(layout="wide")
 
25
  HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
26
  HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
27
  PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
28
+ ADD_BATCH_SIZE = 500 # Batch size for adding to Chroma
29
  TOP_K = 10
30
  INITIAL_N_RESULTS = 50
31
  MAX_NEW_TOKENS = 512
 
129
  embedding_model = load_local_embedding_model()
130
  # ---
131
 
132
+ # --- Setup ChromaDB Collection (using Session State and Temp Dir) ---
 
133
  def setup_chroma_collection():
134
+ """Loads data from HF, sets up ChromaDB in a temp dir, populates it, and returns the collection."""
135
  if 'chroma_collection' in st.session_state and st.session_state.chroma_collection is not None:
136
+ # Basic check: see if collection is queryable
137
+ try:
138
+ st.session_state.chroma_collection.peek(1) # Try a lightweight operation
139
+ logging.info("Using existing Chroma collection from session state.")
140
+ return st.session_state.chroma_collection
141
+ except Exception as e:
142
+ logging.warning(f"Error accessing existing collection in session state ({e}), re-initializing.")
143
+ st.session_state.chroma_collection = None # Force re-init
144
 
145
  # Proceed with setup only if essential components are loaded
146
  if not embedding_model or not generation_client:
 
153
  st.error("Failed to load embedding data. Cannot initialize vector database.")
154
  return None
155
 
156
+ # Create a temporary directory for this session
157
+ # Note: This directory might be cleaned up automatically depending on the OS/environment
158
+ # In HF Spaces ephemeral storage, it will likely be wiped on restart anyway.
159
+ temp_dir = tempfile.mkdtemp()
160
+ logging.info(f"Created temporary directory for ChromaDB: {temp_dir}")
161
+
162
  try:
163
+ logging.info("Initializing ChromaDB client with temporary storage...")
164
+ settings = chromadb.config.Settings(
165
+ persist_directory=temp_dir,
166
+ anonymized_telemetry=False,
167
+ is_persistent=True # Explicitly set for PersistentClient behavior in temp dir
 
 
168
  )
169
+ # Use the standard Client, but point it to the temp directory
170
+ chroma_client = chromadb.Client(settings=settings)
171
 
172
+ # Check if collection exists and delete if it does
173
  try:
174
  existing_collections = [col.name for col in chroma_client.list_collections()]
175
  if COLLECTION_NAME in existing_collections:
 
178
  except Exception as delete_e:
179
  logging.warning(f"Could not check/delete existing collection (might be okay): {delete_e}")
180
 
 
181
  logging.info(f"Creating collection: {COLLECTION_NAME}")
182
  collection_instance = chroma_client.create_collection(
183
  name=COLLECTION_NAME,
 
245
  return None
246
 
247
  # --- Initialize collection ---
 
248
  collection = setup_chroma_collection()
249
  # ---
250