Zwounds committed on
Commit
c51456e
·
verified ·
1 Parent(s): d51ec77

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -36
app.py CHANGED
@@ -93,12 +93,10 @@ def load_data_and_setup_chroma():
93
  # Ensure dependent resources are loaded first
94
  if not generation_client or not embedding_model:
95
  st.error("Required clients/models not initialized. Cannot proceed.")
96
- # Potentially redundant with individual init checks, but safe
97
  st.stop()
98
 
99
  try:
100
  logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
101
- # Download the specific parquet file from the dataset repo
102
  try:
103
  parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
104
  logging.info(f"Downloaded dataset file to: {parquet_path}")
@@ -111,24 +109,21 @@ def load_data_and_setup_chroma():
111
  df = pd.read_parquet(parquet_path)
112
  logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
113
 
114
- # Verify required columns
115
  required_cols = ['id', 'document', 'embedding', 'metadata']
116
  if not all(col in df.columns for col in required_cols):
117
  st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
118
  logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
119
  st.stop()
120
 
121
- # Ensure embeddings are lists of floats
122
  logging.info("Ensuring embeddings are in list format...")
123
- # Check if the first embedding is already a list of floats, otherwise convert
124
- if not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float):
125
  df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
126
  logging.info("Converted embeddings to list[float].")
127
  else:
128
- logging.info("Embeddings already seem to be in list[float] format.")
129
 
130
  initial_rows = len(df)
131
- df.dropna(subset=['embedding'], inplace=True) # Drop rows where embedding is None
132
  if len(df) < initial_rows:
133
  logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
134
 
@@ -138,7 +133,13 @@ def load_data_and_setup_chroma():
138
  st.stop()
139
 
140
  logging.info("Initializing in-memory ChromaDB client...")
141
- chroma_client = chromadb.Client() # In-memory client
 
 
 
 
 
 
142
 
143
  try:
144
  chroma_client.delete_collection(name=COLLECTION_NAME)
@@ -146,7 +147,6 @@ def load_data_and_setup_chroma():
146
  except: pass
147
 
148
  logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
149
- # Create collection WITHOUT embedding function
150
  collection = chroma_client.create_collection(
151
  name=COLLECTION_NAME,
152
  metadata={"hnsw:space": "cosine"}
@@ -164,47 +164,41 @@ def load_data_and_setup_chroma():
164
  batch_df = df.iloc[start_idx:end_idx]
165
 
166
  try:
167
- # Convert metadata column if it contains dicts
168
- metadatas_list = batch_df['metadata'].tolist()
169
- if metadatas_list and isinstance(metadatas_list[0], dict):
170
- pass # Already list of dicts
171
- else:
172
- # Attempt to parse if they are JSON strings, otherwise use empty dicts
173
- parsed_metadatas = []
174
- for item in metadatas_list:
175
- try:
176
- parsed = json.loads(item) if isinstance(item, str) else item
177
- parsed_metadatas.append(parsed if isinstance(parsed, dict) else {})
178
- except:
179
- parsed_metadatas.append({})
180
- metadatas_list = parsed_metadatas # This line has the wrong indentation
181
-
182
- # --- Clean None values from metadata ---
183
  cleaned_metadatas = []
184
- for meta_dict in metadatas_list:
185
  cleaned_dict = {}
186
- if isinstance(meta_dict, dict):
187
- for key, value in meta_dict.items():
188
- # Replace None with empty string, keep other valid types
 
 
 
 
 
 
 
 
 
189
  if value is None:
190
- cleaned_dict[key] = ""
191
  elif isinstance(value, (str, int, float, bool)):
192
- cleaned_dict[key] = value
193
  else:
194
- # Attempt to convert other types to string, or skip
195
- try:
196
  cleaned_dict[key] = str(value)
197
  logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
198
  except:
199
  logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
200
  cleaned_metadatas.append(cleaned_dict)
201
- # -----------------------------------------
202
 
 
203
  collection.add(
204
  ids=batch_df['id'].tolist(),
205
  embeddings=batch_df['embedding'].tolist(),
206
  documents=batch_df['document'].tolist(),
207
- metadatas=cleaned_metadatas # Use cleaned list
208
  )
209
  except Exception as e:
210
  logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
@@ -217,6 +211,13 @@ def load_data_and_setup_chroma():
217
  if error_count > 0:
218
  logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
219
 
 
 
 
 
 
 
 
220
  st.success("Embeddings loaded successfully!")
221
  return collection
222
 
 
93
  # Ensure dependent resources are loaded first
94
  if not generation_client or not embedding_model:
95
  st.error("Required clients/models not initialized. Cannot proceed.")
 
96
  st.stop()
97
 
98
  try:
99
  logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
 
100
  try:
101
  parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
102
  logging.info(f"Downloaded dataset file to: {parquet_path}")
 
109
  df = pd.read_parquet(parquet_path)
110
  logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
111
 
 
112
  required_cols = ['id', 'document', 'embedding', 'metadata']
113
  if not all(col in df.columns for col in required_cols):
114
  st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
115
  logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
116
  st.stop()
117
 
 
118
  logging.info("Ensuring embeddings are in list format...")
119
+ if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
 
120
  df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
121
  logging.info("Converted embeddings to list[float].")
122
  else:
123
+ logging.info("Embeddings already seem to be in list[float] format or DataFrame is empty.")
124
 
125
  initial_rows = len(df)
126
+ df.dropna(subset=['embedding'], inplace=True)
127
  if len(df) < initial_rows:
128
  logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
129
 
 
133
  st.stop()
134
 
135
  logging.info("Initializing in-memory ChromaDB client...")
136
+ # Explicitly configure for in-memory using DuckDB+Parquet
137
+ settings = chromadb.config.Settings(
138
+ chroma_api_impl="local",
139
+ chroma_db_impl="duckdb+parquet",
140
+ persist_directory=None # Ensure no persistence is attempted
141
+ )
142
+ chroma_client = chromadb.Client(settings=settings)
143
 
144
  try:
145
  chroma_client.delete_collection(name=COLLECTION_NAME)
 
147
  except: pass
148
 
149
  logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
 
150
  collection = chroma_client.create_collection(
151
  name=COLLECTION_NAME,
152
  metadata={"hnsw:space": "cosine"}
 
164
  batch_df = df.iloc[start_idx:end_idx]
165
 
166
  try:
167
+ # Prepare metadata for the batch
168
+ metadatas_list_raw = batch_df['metadata'].tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  cleaned_metadatas = []
170
+ for item in metadatas_list_raw:
171
  cleaned_dict = {}
172
+ # Handle potential non-dict items loaded from parquet/dataset
173
+ if isinstance(item, dict):
174
+ current_meta = item
175
+ else:
176
+ try: # Attempt to parse if it's a JSON string
177
+ current_meta = json.loads(item) if isinstance(item, str) else {}
178
+ except:
179
+ current_meta = {} # Default to empty dict if not dict or valid JSON
180
+
181
+ # Clean None values within the dictionary
182
+ if isinstance(current_meta, dict):
183
+ for key, value in current_meta.items():
184
  if value is None:
185
+ cleaned_dict[key] = "" # Replace None with empty string
186
  elif isinstance(value, (str, int, float, bool)):
187
+ cleaned_dict[key] = value # Keep allowed types
188
  else:
189
+ try: # Attempt to convert others to string
 
190
  cleaned_dict[key] = str(value)
191
  logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
192
  except:
193
  logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
194
  cleaned_metadatas.append(cleaned_dict)
 
195
 
196
+ # Add the batch with cleaned metadata
197
  collection.add(
198
  ids=batch_df['id'].tolist(),
199
  embeddings=batch_df['embedding'].tolist(),
200
  documents=batch_df['document'].tolist(),
201
+ metadatas=cleaned_metadatas # Use the cleaned list
202
  )
203
  except Exception as e:
204
  logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
 
211
  if error_count > 0:
212
  logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
213
 
214
+ # Verify count after adding
215
+ final_count = collection.count()
216
+ logging.info(f"Final document count in Chroma collection: {final_count}")
217
+ if final_count == 0 and len(df) > 0:
218
+ st.warning("ChromaDB collection is empty after attempting to add documents. Check logs for errors.")
219
+ # Don't necessarily stop, but warn the user.
220
+
221
  st.success("Embeddings loaded successfully!")
222
  return collection
223