Zwounds committed on
Commit
5b66564
·
verified ·
1 Parent(s): cab221e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -21
app.py CHANGED
@@ -16,6 +16,10 @@ from sentence_transformers import SentenceTransformer
16
  # Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
17
  # import chromadb.utils.embedding_functions as embedding_functions
18
 
 
 
 
 
19
  # --- Configuration ---
20
  # DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
21
  COLLECTION_NAME = "libguides_content"
@@ -86,39 +90,45 @@ embedding_model = load_local_embedding_model()
86
  # --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
87
  @st.cache_resource
88
  def load_data_and_setup_chroma():
 
89
  if not generation_client or not embedding_model:
90
  st.error("Required clients/models not initialized. Cannot proceed.")
 
91
  st.stop()
92
 
93
  try:
94
  logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
95
- # Load the dataset - might need split='train' if applicable
96
- # Handle potential errors during download/load
97
  try:
98
- dataset = load_dataset(HF_DATASET_ID, split='train') # Assuming default split is 'train'
99
- except Exception as load_e:
100
- logging.error(f"Failed to load dataset '{HF_DATASET_ID}': {load_e}")
101
- st.error(f"Failed to load dataset '{HF_DATASET_ID}'. Check dataset ID and availability.")
 
102
  st.stop()
103
 
104
- logging.info("Converting dataset to Pandas DataFrame...")
105
- df = dataset.to_pandas()
106
  logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
107
 
108
  # Verify required columns
109
  required_cols = ['id', 'document', 'embedding', 'metadata']
110
  if not all(col in df.columns for col in required_cols):
111
- st.error(f"Dataset is missing required columns. Found: {df.columns}. Required: {required_cols}")
112
- logging.error(f"Dataset missing required columns. Found: {df.columns}")
113
  st.stop()
114
 
115
- # Ensure embeddings are lists of floats (Parquet might store them efficiently)
116
- # This might not be strictly necessary if ChromaDB handles numpy arrays, but safer to convert
117
  logging.info("Ensuring embeddings are in list format...")
118
- df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
119
- # Drop rows where embedding conversion failed
 
 
 
 
 
120
  initial_rows = len(df)
121
- df.dropna(subset=['embedding'], inplace=True)
122
  if len(df) < initial_rows:
123
  logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
124
 
@@ -130,16 +140,16 @@ def load_data_and_setup_chroma():
130
  logging.info("Initializing in-memory ChromaDB client...")
131
  chroma_client = chromadb.Client() # In-memory client
132
 
133
- # Delete collection if it somehow exists in memory (unlikely but safe)
134
  try:
135
  chroma_client.delete_collection(name=COLLECTION_NAME)
 
136
  except: pass
137
 
138
  logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
139
- # Create collection WITHOUT embedding function - we provide pre-computed ones
140
  collection = chroma_client.create_collection(
141
  name=COLLECTION_NAME,
142
- metadata={"hnsw:space": "cosine"} # Or dot if BGE prefers
143
  )
144
 
145
  logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
@@ -154,11 +164,26 @@ def load_data_and_setup_chroma():
154
  batch_df = df.iloc[start_idx:end_idx]
155
 
156
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  collection.add(
158
  ids=batch_df['id'].tolist(),
159
  embeddings=batch_df['embedding'].tolist(),
160
  documents=batch_df['document'].tolist(),
161
- metadatas=batch_df['metadata'].tolist()
162
  )
163
  except Exception as e:
164
  logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
@@ -182,7 +207,7 @@ def load_data_and_setup_chroma():
182
  st.error(f"Failed to load data and initialize ChromaDB: {e}")
183
  logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
184
  st.stop()
185
- return None # Should not be reached
186
 
187
  # --- Load data and collection ---
188
  collection = load_data_and_setup_chroma()
@@ -259,7 +284,6 @@ Answer:"""
259
  return prompt
260
 
261
  # --- Streamlit App UI ---
262
- st.set_page_config(layout="wide")
263
  st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
264
 
265
  # User input (only proceed if collection loaded)
 
16
  # Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
17
  # import chromadb.utils.embedding_functions as embedding_functions
18
 
19
+ # --- Page Config (MUST BE FIRST Streamlit call) ---
20
+ st.set_page_config(layout="wide")
21
+ # ---
22
+
23
  # --- Configuration ---
24
  # DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
25
  COLLECTION_NAME = "libguides_content"
 
90
  # --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
91
  @st.cache_resource
92
  def load_data_and_setup_chroma():
93
+ # Ensure dependent resources are loaded first
94
  if not generation_client or not embedding_model:
95
  st.error("Required clients/models not initialized. Cannot proceed.")
96
+ # Potentially redundant with individual init checks, but safe
97
  st.stop()
98
 
99
  try:
100
  logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
101
+ # Download the specific parquet file from the dataset repo
 
102
  try:
103
+ parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
104
+ logging.info(f"Downloaded dataset file to: {parquet_path}")
105
+ except Exception as download_e:
106
+ logging.error(f"Failed to download dataset file '{PARQUET_FILENAME}' from '{HF_DATASET_ID}': {download_e}")
107
+ st.error(f"Failed to download dataset '{HF_DATASET_ID}'. Check dataset ID, filename, and token permissions.")
108
  st.stop()
109
 
110
+ logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
111
+ df = pd.read_parquet(parquet_path)
112
  logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
113
 
114
  # Verify required columns
115
  required_cols = ['id', 'document', 'embedding', 'metadata']
116
  if not all(col in df.columns for col in required_cols):
117
+ st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
118
+ logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
119
  st.stop()
120
 
121
+ # Ensure embeddings are lists of floats
 
122
  logging.info("Ensuring embeddings are in list format...")
123
+ # Check if the first embedding is already a list of floats, otherwise convert
124
+ if not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float):
125
+ df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
126
+ logging.info("Converted embeddings to list[float].")
127
+ else:
128
+ logging.info("Embeddings already seem to be in list[float] format.")
129
+
130
  initial_rows = len(df)
131
+ df.dropna(subset=['embedding'], inplace=True) # Drop rows where embedding is None
132
  if len(df) < initial_rows:
133
  logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
134
 
 
140
  logging.info("Initializing in-memory ChromaDB client...")
141
  chroma_client = chromadb.Client() # In-memory client
142
 
 
143
  try:
144
  chroma_client.delete_collection(name=COLLECTION_NAME)
145
+ logging.info(f"Deleted existing in-memory collection (if any): {COLLECTION_NAME}")
146
  except: pass
147
 
148
  logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
149
+ # Create collection WITHOUT embedding function
150
  collection = chroma_client.create_collection(
151
  name=COLLECTION_NAME,
152
+ metadata={"hnsw:space": "cosine"}
153
  )
154
 
155
  logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
 
164
  batch_df = df.iloc[start_idx:end_idx]
165
 
166
  try:
167
+ # Convert metadata column if it contains dicts
168
+ metadatas_list = batch_df['metadata'].tolist()
169
+ if metadatas_list and isinstance(metadatas_list[0], dict):
170
+ pass # Already list of dicts
171
+ else:
172
+ # Attempt to parse if they are JSON strings, otherwise use empty dicts
173
+ parsed_metadatas = []
174
+ for item in metadatas_list:
175
+ try:
176
+ parsed = json.loads(item) if isinstance(item, str) else item
177
+ parsed_metadatas.append(parsed if isinstance(parsed, dict) else {})
178
+ except:
179
+ parsed_metadatas.append({})
180
+ metadatas_list = parsed_metadatas
181
+
182
  collection.add(
183
  ids=batch_df['id'].tolist(),
184
  embeddings=batch_df['embedding'].tolist(),
185
  documents=batch_df['document'].tolist(),
186
+ metadatas=metadatas_list
187
  )
188
  except Exception as e:
189
  logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
 
207
  st.error(f"Failed to load data and initialize ChromaDB: {e}")
208
  logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
209
  st.stop()
210
+ return None
211
 
212
  # --- Load data and collection ---
213
  collection = load_data_and_setup_chroma()
 
284
  return prompt
285
 
286
  # --- Streamlit App UI ---
 
287
  st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
288
 
289
  # User input (only proceed if collection loaded)