Zwounds committed on
Commit
93c51f9
·
verified ·
1 Parent(s): c51456e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -139
app.py CHANGED
@@ -5,43 +5,38 @@ import sys
5
  import json
6
  import os
7
  from dotenv import load_dotenv
8
- from huggingface_hub import InferenceClient, hf_hub_download # Added for dataset download
9
  import numpy as np
10
  import time
11
  from tqdm import tqdm
12
- # Need datasets, pandas, sentence-transformers
13
- from datasets import load_dataset, DatasetDict, Dataset
14
  import pandas as pd
15
  from sentence_transformers import SentenceTransformer
16
- # Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
17
- # import chromadb.utils.embedding_functions as embedding_functions
18
 
19
  # --- Page Config (MUST BE FIRST Streamlit call) ---
20
  st.set_page_config(layout="wide")
21
  # ---
22
 
23
  # --- Configuration ---
24
- # DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
25
  COLLECTION_NAME = "libguides_content"
26
  LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
27
  HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
28
  HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
29
  PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
30
- # INPUT_FILE = 'extracted_content.jsonl' # No longer needed for app runtime
31
- # EMBEDDING_BATCH_SIZE = 100 # Batch size for adding docs to ChromaDB (now done during load)
32
  ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
33
  TOP_K = 10
34
  INITIAL_N_RESULTS = 50
35
- API_RETRY_DELAY = 2
36
  MAX_NEW_TOKENS = 512
37
  # ---
38
 
39
  # Setup logging
40
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stderr)
41
 
42
- # --- Load API Key and Initialize HF Generation Client ---
 
43
  @st.cache_resource
44
  def initialize_hf_client():
 
45
  generation_client_instance = None
46
  try:
47
  load_dotenv()
@@ -60,12 +55,9 @@ def initialize_hf_client():
60
  st.stop()
61
  return None
62
 
63
- generation_client = initialize_hf_client()
64
- # ---
65
-
66
- # --- Load Local Embedding Model (for Queries) ---
67
  @st.cache_resource
68
  def load_local_embedding_model():
 
69
  logging.info(f"Loading local embedding model for queries: {LOCAL_EMBEDDING_MODEL}")
70
  try:
71
  import torch
@@ -84,37 +76,26 @@ def load_local_embedding_model():
84
  st.stop()
85
  return None
86
 
87
- embedding_model = load_local_embedding_model()
88
- # ---
89
-
90
- # --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
91
  @st.cache_resource
92
- def load_data_and_setup_chroma():
93
- # Ensure dependent resources are loaded first
94
- if not generation_client or not embedding_model:
95
- st.error("Required clients/models not initialized. Cannot proceed.")
96
- st.stop()
97
-
98
  try:
99
- logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
100
- try:
101
- parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
102
- logging.info(f"Downloaded dataset file to: {parquet_path}")
103
- except Exception as download_e:
104
- logging.error(f"Failed to download dataset file '{PARQUET_FILENAME}' from '{HF_DATASET_ID}': {download_e}")
105
- st.error(f"Failed to download dataset '{HF_DATASET_ID}'. Check dataset ID, filename, and token permissions.")
106
- st.stop()
107
 
108
  logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
109
  df = pd.read_parquet(parquet_path)
110
  logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
111
 
 
112
  required_cols = ['id', 'document', 'embedding', 'metadata']
113
  if not all(col in df.columns for col in required_cols):
114
  st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
115
  logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
116
- st.stop()
117
 
 
118
  logging.info("Ensuring embeddings are in list format...")
119
  if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
120
  df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
@@ -130,131 +111,131 @@ def load_data_and_setup_chroma():
130
  if df.empty:
131
  st.error("No valid data loaded from the dataset after processing embeddings.")
132
  logging.error("DataFrame empty after embedding processing.")
133
- st.stop()
134
 
135
- logging.info("Initializing in-memory ChromaDB client...")
136
- # Explicitly configure for in-memory using DuckDB+Parquet
137
- settings = chromadb.config.Settings(
138
- chroma_api_impl="local",
139
- chroma_db_impl="duckdb+parquet",
140
- persist_directory=None # Ensure no persistence is attempted
141
- )
142
- chroma_client = chromadb.Client(settings=settings)
143
-
144
- try:
145
- chroma_client.delete_collection(name=COLLECTION_NAME)
146
- logging.info(f"Deleted existing in-memory collection (if any): {COLLECTION_NAME}")
147
- except: pass
148
-
149
- logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
150
- collection = chroma_client.create_collection(
151
- name=COLLECTION_NAME,
152
- metadata={"hnsw:space": "cosine"}
153
- )
154
-
155
- logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
156
- start_time = time.time()
157
- error_count = 0
158
- num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
159
- progress_bar = st.progress(0, text="Loading embeddings into memory...")
160
-
161
- for i in range(num_batches):
162
- start_idx = i * ADD_BATCH_SIZE
163
- end_idx = start_idx + ADD_BATCH_SIZE
164
- batch_df = df.iloc[start_idx:end_idx]
165
-
166
- try:
167
- # Prepare metadata for the batch
168
- metadatas_list_raw = batch_df['metadata'].tolist()
169
- cleaned_metadatas = []
170
- for item in metadatas_list_raw:
171
- cleaned_dict = {}
172
- # Handle potential non-dict items loaded from parquet/dataset
173
- if isinstance(item, dict):
174
- current_meta = item
175
- else:
176
- try: # Attempt to parse if it's a JSON string
177
- current_meta = json.loads(item) if isinstance(item, str) else {}
178
- except:
179
- current_meta = {} # Default to empty dict if not dict or valid JSON
180
-
181
- # Clean None values within the dictionary
182
- if isinstance(current_meta, dict):
183
- for key, value in current_meta.items():
184
- if value is None:
185
- cleaned_dict[key] = "" # Replace None with empty string
186
- elif isinstance(value, (str, int, float, bool)):
187
- cleaned_dict[key] = value # Keep allowed types
188
- else:
189
- try: # Attempt to convert others to string
190
- cleaned_dict[key] = str(value)
191
- logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
192
- except:
193
- logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
194
- cleaned_metadatas.append(cleaned_dict)
195
-
196
- # Add the batch with cleaned metadata
197
- collection.add(
198
- ids=batch_df['id'].tolist(),
199
- embeddings=batch_df['embedding'].tolist(),
200
- documents=batch_df['document'].tolist(),
201
- metadatas=cleaned_metadatas # Use the cleaned list
202
- )
203
- except Exception as e:
204
- logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
205
- error_count += 1
206
- progress_bar.progress((i + 1) / num_batches, text=f"Loading embeddings... Batch {i+1}/{num_batches}")
207
-
208
- progress_bar.empty()
209
- end_time = time.time()
210
- logging.info(f"Finished loading data into in-memory ChromaDB. Took {end_time - start_time:.2f} seconds.")
211
- if error_count > 0:
212
- logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
213
-
214
- # Verify count after adding
215
- final_count = collection.count()
216
- logging.info(f"Final document count in Chroma collection: {final_count}")
217
- if final_count == 0 and len(df) > 0:
218
- st.warning("ChromaDB collection is empty after attempting to add documents. Check logs for errors.")
219
- # Don't necessarily stop, but warn the user.
220
-
221
- st.success("Embeddings loaded successfully!")
222
- return collection
223
 
224
  except ImportError as e:
225
  st.error(f"ImportError: {e}. Required libraries might be missing (datasets, pandas, pyarrow). Check requirements.txt.")
226
- logging.error(f"ImportError during dataset loading/Chroma setup: {e}")
227
- st.stop()
228
  except Exception as e:
229
- st.error(f"Failed to load data and initialize ChromaDB: {e}")
230
- logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
231
- st.stop()
232
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- # --- Load data and collection ---
235
- collection = load_data_and_setup_chroma()
 
 
 
 
 
 
236
  # ---
237
 
238
  # --- Helper Functions ---
239
  def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MODEL):
240
  """Sends the prompt to the HF Inference API using the initialized client."""
 
241
  if not client_instance:
242
- client_instance = generation_client
243
- if not client_instance:
244
- logging.error("HF Inference client not initialized in query_hf_inference.")
245
  return "Error: HF Inference client failed to initialize."
246
  try:
247
  response_text = client_instance.text_generation(prompt, max_new_tokens=MAX_NEW_TOKENS)
248
  if not response_text:
249
- logging.warning(f"Received empty response from HF Inference API ({model_name}) for prompt: {prompt[:100]}...")
250
  return "Error: Received empty response from generation model."
251
  return response_text.strip()
252
  except Exception as e:
253
- logging.exception(f"An unexpected error occurred while querying HF Inference API ({model_name}): {e}")
254
  return f"Error: An unexpected error occurred while generating the answer using {model_name}."
255
 
256
  def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
257
  """Uses LLM (HF Inference API) to generate alternative phrasings."""
 
258
  prompt = f"""Given the user query: "{query}"
259
  Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
260
  Focus on synonyms, different levels of specificity, and related concepts.
@@ -287,8 +268,10 @@ Output:"""
287
  logging.error(f"Failed to generate query variations: {e}")
288
  return []
289
 
 
290
  def generate_prompt(query, context_chunks):
291
  """Generates a prompt for the LLM."""
 
292
  context_str = "\n\n".join(context_chunks)
293
  liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
294
  prompt = f"""Based on the following context from the library guides, answer the user's question.
@@ -306,14 +289,13 @@ Answer:"""
306
  return prompt
307
 
308
  # --- Streamlit App UI ---
309
- st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
310
 
311
- # User input (only proceed if collection loaded)
312
  if collection:
313
  query = st.text_area("Enter your question:", height=100)
314
  else:
315
- # Error handled during load_data_and_setup_chroma
316
- st.error("Application initialization failed. Cannot proceed.")
317
  st.stop()
318
 
319
  # --- Routing Prompt Definition ---
 
5
  import json
6
  import os
7
  from dotenv import load_dotenv
8
+ from huggingface_hub import InferenceClient, hf_hub_download
9
  import numpy as np
10
  import time
11
  from tqdm import tqdm
12
+ from datasets import load_dataset
 
13
  import pandas as pd
14
  from sentence_transformers import SentenceTransformer
 
 
15
 
16
  # --- Page Config (MUST BE FIRST Streamlit call) ---
17
  st.set_page_config(layout="wide")
18
  # ---
19
 
20
  # --- Configuration ---
 
21
  COLLECTION_NAME = "libguides_content"
22
  LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
23
  HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
24
  HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
25
  PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
 
 
26
  ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
27
  TOP_K = 10
28
  INITIAL_N_RESULTS = 50
 
29
  MAX_NEW_TOKENS = 512
30
  # ---
31
 
32
  # Setup logging
33
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stderr)
34
 
35
+ # --- Cached Resource Loading ---
36
+
37
  @st.cache_resource
38
  def initialize_hf_client():
39
+ """Initializes and returns the HF Inference Client for generation."""
40
  generation_client_instance = None
41
  try:
42
  load_dotenv()
 
55
  st.stop()
56
  return None
57
 
 
 
 
 
58
  @st.cache_resource
59
  def load_local_embedding_model():
60
+ """Loads and returns the local Sentence Transformer model for query embedding."""
61
  logging.info(f"Loading local embedding model for queries: {LOCAL_EMBEDDING_MODEL}")
62
  try:
63
  import torch
 
76
  st.stop()
77
  return None
78
 
 
 
 
 
79
  @st.cache_resource
80
+ def load_dataset_from_hf():
81
+ """Downloads the dataset parquet file and loads it into a Pandas DataFrame."""
 
 
 
 
82
  try:
83
+ logging.info(f"Downloading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
84
+ parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
85
+ logging.info(f"Downloaded dataset file to: {parquet_path}")
 
 
 
 
 
86
 
87
  logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
88
  df = pd.read_parquet(parquet_path)
89
  logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
90
 
91
+ # Verify required columns
92
  required_cols = ['id', 'document', 'embedding', 'metadata']
93
  if not all(col in df.columns for col in required_cols):
94
  st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
95
  logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
96
+ return None # Return None on error
97
 
98
+ # Ensure embeddings are lists of floats
99
  logging.info("Ensuring embeddings are in list format...")
100
  if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
101
  df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
 
111
  if df.empty:
112
  st.error("No valid data loaded from the dataset after processing embeddings.")
113
  logging.error("DataFrame empty after embedding processing.")
114
+ return None # Return None on error
115
 
116
+ return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  except ImportError as e:
119
  st.error(f"ImportError: {e}. Required libraries might be missing (datasets, pandas, pyarrow). Check requirements.txt.")
120
+ logging.error(f"ImportError during dataset loading: {e}")
 
121
  except Exception as e:
122
+ st.error(f"Failed to load data from dataset: {e}")
123
+ logging.exception(f"An unexpected error occurred during data load: {e}")
124
+
125
+ return None # Return None on any error
126
+
127
+ # --- Initialize Clients and Models ---
128
+ generation_client = initialize_hf_client()
129
+ embedding_model = load_local_embedding_model()
130
+ # ---
131
+
132
+ # --- Setup ChromaDB Collection (using Session State) ---
133
+ if 'chroma_collection' not in st.session_state:
134
+ st.session_state.chroma_collection = None
135
+ if embedding_model and generation_client: # Only proceed if models/clients loaded
136
+ with st.spinner("Loading and preparing vector database..."):
137
+ df = load_dataset_from_hf()
138
+ if df is not None and not df.empty:
139
+ try:
140
+ logging.info("Initializing Ephemeral ChromaDB client...")
141
+ chroma_client = chromadb.EphemeralClient() # Use Ephemeral Client
142
+
143
+ # Delete collection if it somehow exists (unlikely for ephemeral)
144
+ try:
145
+ chroma_client.delete_collection(name=COLLECTION_NAME)
146
+ logging.info(f"Deleted existing collection (if any): {COLLECTION_NAME}")
147
+ except: pass
148
+
149
+ logging.info(f"Creating collection: {COLLECTION_NAME}")
150
+ collection_instance = chroma_client.create_collection(
151
+ name=COLLECTION_NAME,
152
+ metadata={"hnsw:space": "cosine"}
153
+ )
154
+
155
+ logging.info(f"Adding {len(df)} documents to ChromaDB in batches of {ADD_BATCH_SIZE}...")
156
+ start_time = time.time()
157
+ error_count = 0
158
+ num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
159
+
160
+ for i in range(num_batches):
161
+ start_idx = i * ADD_BATCH_SIZE
162
+ end_idx = start_idx + ADD_BATCH_SIZE
163
+ batch_df = df.iloc[start_idx:end_idx]
164
+
165
+ try:
166
+ # Prepare and clean metadata for the batch
167
+ metadatas_list_raw = batch_df['metadata'].tolist()
168
+ cleaned_metadatas = []
169
+ for item in metadatas_list_raw:
170
+ cleaned_dict = {}
171
+ if isinstance(item, dict):
172
+ current_meta = item
173
+ else:
174
+ try: current_meta = json.loads(item) if isinstance(item, str) else {}
175
+ except: current_meta = {}
176
+
177
+ if isinstance(current_meta, dict):
178
+ for key, value in current_meta.items():
179
+ if value is None: cleaned_dict[key] = ""
180
+ elif isinstance(value, (str, int, float, bool)): cleaned_dict[key] = value
181
+ else:
182
+ try: cleaned_dict[key] = str(value)
183
+ except: pass # Skip unconvertible types
184
+ cleaned_metadatas.append(cleaned_dict)
185
+
186
+ # Add the batch
187
+ collection_instance.add(
188
+ ids=batch_df['id'].tolist(),
189
+ embeddings=batch_df['embedding'].tolist(),
190
+ documents=batch_df['document'].tolist(),
191
+ metadatas=cleaned_metadatas
192
+ )
193
+ except Exception as e:
194
+ logging.error(f"Error adding batch {i+1}/{num_batches} to Chroma: {e}")
195
+ error_count += 1
196
+
197
+ end_time = time.time()
198
+ logging.info(f"Finished loading data into ChromaDB. Took {end_time - start_time:.2f} seconds.")
199
+ if error_count > 0: logging.warning(f"Encountered errors in {error_count} batches during add.")
200
+
201
+ final_count = collection_instance.count()
202
+ logging.info(f"Final document count in Chroma collection: {final_count}")
203
+ if final_count > 0:
204
+ st.session_state.chroma_collection = collection_instance
205
+ st.success("Vector database loaded successfully!")
206
+ else:
207
+ st.error("Failed to load documents into the vector database.")
208
 
209
+ except Exception as setup_e:
210
+ st.error(f"Failed to setup ChromaDB: {setup_e}")
211
+ logging.exception(f"Failed to setup ChromaDB: {setup_e}")
212
+ else:
213
+ st.error("Failed to load data from the dataset. Cannot initialize database.")
214
+
215
+ # Assign collection from session state for use in the app
216
+ collection = st.session_state.get('chroma_collection', None)
217
  # ---
218
 
219
  # --- Helper Functions ---
220
  def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MODEL):
221
  """Sends the prompt to the HF Inference API using the initialized client."""
222
+ if not client_instance: client_instance = generation_client
223
  if not client_instance:
224
+ logging.error("HF Inference client not initialized.")
 
 
225
  return "Error: HF Inference client failed to initialize."
226
  try:
227
  response_text = client_instance.text_generation(prompt, max_new_tokens=MAX_NEW_TOKENS)
228
  if not response_text:
229
+ logging.warning(f"Received empty response from HF Inference API ({model_name}).")
230
  return "Error: Received empty response from generation model."
231
  return response_text.strip()
232
  except Exception as e:
233
+ logging.exception(f"Error querying HF Inference API ({model_name}): {e}")
234
  return f"Error: An unexpected error occurred while generating the answer using {model_name}."
235
 
236
  def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
237
  """Uses LLM (HF Inference API) to generate alternative phrasings."""
238
+ # Build a prompt asking the LLM for alternative phrasings of the user's query.
239
  prompt = f"""Given the user query: "{query}"
240
  Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
241
  Focus on synonyms, different levels of specificity, and related concepts.
 
268
  logging.error(f"Failed to generate query variations: {e}")
269
  return []
270
 
271
+
272
  def generate_prompt(query, context_chunks):
273
  """Generates a prompt for the LLM."""
274
+ # Join the retrieved context chunks into one string and embed it in the answer prompt.
275
  context_str = "\n\n".join(context_chunks)
276
  liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
277
  prompt = f"""Based on the following context from the library guides, answer the user's question.
 
289
  return prompt
290
 
291
  # --- Streamlit App UI ---
292
+ st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)")
293
 
294
+ # User input (only proceed if collection is ready)
295
  if collection:
296
  query = st.text_area("Enter your question:", height=100)
297
  else:
298
+ st.error("Application initialization failed: Vector database not loaded.")
 
299
  st.stop()
300
 
301
  # --- Routing Prompt Definition ---