Zwounds committed · verified
Commit cab221e · 1 parent: b10473a

Upload app.py

Files changed (1)
  1. app.py +144 -195
app.py CHANGED
@@ -7,46 +7,45 @@ import os
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient
 import numpy as np
-import time # Added for embedding delay/timing
-from tqdm import tqdm # Added for embedding progress
-# Import ChromaDB's helper for Sentence Transformers
-import chromadb.utils.embedding_functions as embedding_functions
-# from sentence_transformers import CrossEncoder # Keep if re-ranking might be used
-
-# --- Page Config (Must be first Streamlit command) ---
-st.set_page_config(layout="wide")
-# ---
+import time
+from tqdm import tqdm
+# Need datasets, pandas, sentence-transformers
+from datasets import load_dataset, DatasetDict, Dataset
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+# Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
+# import chromadb.utils.embedding_functions as embedding_functions
 
 # --- Configuration ---
-DB_PATH = "./chroma_db"
-COLLECTION_NAME = "libguides_content" # Must match the embedding script
-LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for ChromaDB's function
+# DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
+COLLECTION_NAME = "libguides_content"
+LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
 HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
-INPUT_FILE = 'extracted_content.jsonl' # Source data for embedding
-EMBEDDING_BATCH_SIZE = 100 # Batch size for adding docs to ChromaDB
-# CROSS_ENCODER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2' # Model for re-ranking (DISABLED)
-TOP_K = 10 # Number of *final* unique chunks to send to LLM
-INITIAL_N_RESULTS = 50 # Number of candidates from initial vector search
-API_RETRY_DELAY = 2 # Delay for generation API if needed
-MAX_NEW_TOKENS = 512 # Max tokens for HF text generation
+HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
+PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
+# INPUT_FILE = 'extracted_content.jsonl' # No longer needed for app runtime
+# EMBEDDING_BATCH_SIZE = 100 # Batch size for adding docs to ChromaDB (now done during load)
+ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
+TOP_K = 10
+INITIAL_N_RESULTS = 50
+API_RETRY_DELAY = 2
+MAX_NEW_TOKENS = 512
 # ---
 
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stderr)
 
 # --- Load API Key and Initialize HF Generation Client ---
-# Wrap client initialization in a cached function to avoid re-initializing on every interaction
 @st.cache_resource
 def initialize_hf_client():
     generation_client_instance = None
     try:
         load_dotenv()
-        # Read HF_TOKEN from environment variable first (for Spaces secrets), fallback to .env
         HF_TOKEN = os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN')
         if not HF_TOKEN:
-            logging.error("HF_TOKEN or HUGGING_FACE_HUB_TOKEN not found in environment variables or .env file.")
-            st.error("🔴 Hugging Face Token not found. Please set it as a Space secret named HF_TOKEN or in the .env file as HUGGING_FACE_HUB_TOKEN.")
-            st.stop() # Stop execution if token is missing
+            logging.error("HF_TOKEN or HUGGING_FACE_HUB_TOKEN not found.")
+            st.error("🔴 Hugging Face Token not found. Please set it as a Space secret named HF_TOKEN or in the .env file.")
+            st.stop()
         else:
             generation_client_instance = InferenceClient(model=HF_GENERATION_MODEL, token=HF_TOKEN)
             logging.info(f"Initialized HF Inference Client for generation ({HF_GENERATION_MODEL}).")
@@ -54,18 +53,16 @@ def initialize_hf_client():
     except Exception as e:
         logging.exception("Error initializing Hugging Face Inference Client for generation.")
         st.error(f"🔴 Error initializing Hugging Face Inference Client: {e}")
-        st.stop() # Stop execution on error
-        return None # Should not be reached if st.stop() works
+        st.stop()
+        return None
 
 generation_client = initialize_hf_client()
 # ---
 
-# --- Embedding Function Definition (Needed for DB creation) ---
-# This part is similar to embed_and_store_local_chroma_ef.py
-# Cache the embedding function definition as well
+# --- Load Local Embedding Model (for Queries) ---
 @st.cache_resource
-def get_embedding_function():
-    logging.info(f"Defining embedding function for model: {LOCAL_EMBEDDING_MODEL}")
+def load_local_embedding_model():
+    logging.info(f"Loading local embedding model for queries: {LOCAL_EMBEDDING_MODEL}")
     try:
         import torch
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
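The commit swaps ChromaDB's `SentenceTransformerEmbeddingFunction` for a directly loaded `SentenceTransformer`, so query embedding becomes an explicit call. A minimal sketch of that usage; `normalize_embeddings=True` is an assumption (the app itself calls plain `encode()`), but normalized vectors are what make the cosine-space collection created below behave as expected:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3", device="cpu", trust_remote_code=True)

# encode() takes a list of strings and returns an array of shape (n, dim)
vecs = model.encode(["library hours", "renew a book"], normalize_embeddings=True)
print(vecs.shape)  # e.g. (2, 1024) for bge-m3
```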
@@ -73,175 +70,134 @@ def get_embedding_function():
     except ImportError:
         device = 'cpu'
         logging.info("Torch not found, using device: cpu")
-
     try:
-        ef = embedding_functions.SentenceTransformerEmbeddingFunction(
-            model_name=LOCAL_EMBEDDING_MODEL,
-            device=device,
-            trust_remote_code=True
-        )
-        logging.info("Embedding function defined.")
-        return ef
+        model = SentenceTransformer(LOCAL_EMBEDDING_MODEL, device=device, trust_remote_code=True)
+        logging.info("Local embedding model loaded successfully.")
+        return model
     except Exception as e:
-        st.error(f"Failed to initialize embedding function ({LOCAL_EMBEDDING_MODEL}): {e}")
-        logging.exception(f"Failed to initialize embedding function: {e}")
-        return None
+        st.error(f"Failed to load local embedding model ({LOCAL_EMBEDDING_MODEL}): {e}")
+        logging.exception(f"Failed to load local embedding model: {e}")
+        st.stop()
+        return None
 
-# --- Function to Create and Populate DB ---
-# This integrates logic from embed_and_store_local_chroma_ef.py
-# Use a simple flag file to check if initialization was done in this session/container lifetime
-INIT_FLAG_FILE = os.path.join(DB_PATH, ".initialized")
-
-def initialize_database():
-    # Check if DB exists and is initialized (using flag file for ephemeral systems)
-    if os.path.exists(INIT_FLAG_FILE):
-        logging.info("Initialization flag file found. Assuming DB is ready.")
-        return True
-
-    # Check if DB path exists but maybe wasn't fully initialized
-    db_exists = os.path.exists(DB_PATH) and os.listdir(DB_PATH)
-
-    if db_exists and not os.path.exists(INIT_FLAG_FILE):
-        logging.warning("DB path exists but initialization flag not found. Re-initializing.")
-        # Optionally, could try loading collection here and return True if successful
-        # For simplicity, we'll just re-initialize fully if flag is missing
+embedding_model = load_local_embedding_model()
+# ---
 
-    st.warning(f"ChromaDB not found or needs initialization at {DB_PATH}. Initializing and embedding data... This may take a while.")
-    logging.info(f"Database not found or needs initialization. Running embedding process...")
+# --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
+@st.cache_resource
+def load_data_and_setup_chroma():
+    if not generation_client or not embedding_model:
+        st.error("Required clients/models not initialized. Cannot proceed.")
+        st.stop()
 
     try:
-        ef = get_embedding_function()
-        if not ef: return False # Stop if embedding function failed
-
-        # Load Data
-        logging.info(f"Loading data from {INPUT_FILE}...")
-        if not os.path.exists(INPUT_FILE):
-            st.error(f"Source data file '{INPUT_FILE}' not found. Cannot create database.")
-            logging.error(f"Source data file '{INPUT_FILE}' not found.")
-            return False
-        documents = []
-        metadatas = []
-        ids = []
-        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
-            progress_bar = st.progress(0, text="Loading data...")
-            lines = f.readlines()
-            for i, line in enumerate(lines):
-                try:
-                    data = json.loads(line)
-                    text = data.get('text')
-                    if not text: continue
-                    documents.append(text)
-                    metadata = data.get('metadata', {})
-                    if not isinstance(metadata, dict): metadata = {}
-                    metadatas.append(metadata)
-                    ids.append(f"doc_{i}")
-                except Exception as e:
-                    logging.warning(f"Error processing line {i+1}: {e}")
-                progress_bar.progress((i + 1) / len(lines), text=f"Loading data... {i+1}/{len(lines)}")
-            progress_bar.empty()
-
-        logging.info(f"Loaded {len(documents)} valid documents.")
-        if not documents:
-            st.error("No valid documents loaded from source file.")
-            logging.error("No valid documents loaded.")
-            return False
-
-        # Setup Vector DB
-        logging.info(f"Initializing ChromaDB client at path: {DB_PATH}")
-        chroma_client = chromadb.PersistentClient(path=DB_PATH)
-
+        logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
+        # Load the dataset - might need split='train' if applicable
+        # Handle potential errors during download/load
+        try:
+            dataset = load_dataset(HF_DATASET_ID, split='train') # Assuming default split is 'train'
+        except Exception as load_e:
+            logging.error(f"Failed to load dataset '{HF_DATASET_ID}': {load_e}")
+            st.error(f"Failed to load dataset '{HF_DATASET_ID}'. Check dataset ID and availability.")
+            st.stop()
+
+        logging.info("Converting dataset to Pandas DataFrame...")
+        df = dataset.to_pandas()
+        logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
+
+        # Verify required columns
+        required_cols = ['id', 'document', 'embedding', 'metadata']
+        if not all(col in df.columns for col in required_cols):
+            st.error(f"Dataset is missing required columns. Found: {df.columns}. Required: {required_cols}")
+            logging.error(f"Dataset missing required columns. Found: {df.columns}")
+            st.stop()
+
+        # Ensure embeddings are lists of floats (Parquet might store them efficiently)
+        # This might not be strictly necessary if ChromaDB handles numpy arrays, but safer to convert
+        logging.info("Ensuring embeddings are in list format...")
+        df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
+        # Drop rows where embedding conversion failed
+        initial_rows = len(df)
+        df.dropna(subset=['embedding'], inplace=True)
+        if len(df) < initial_rows:
+            logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
+
+        if df.empty:
+            st.error("No valid data loaded from the dataset after processing embeddings.")
+            logging.error("DataFrame empty after embedding processing.")
+            st.stop()
+
+        logging.info("Initializing in-memory ChromaDB client...")
+        chroma_client = chromadb.Client() # In-memory client
+
+        # Delete collection if it somehow exists in memory (unlikely but safe)
         try:
             chroma_client.delete_collection(name=COLLECTION_NAME)
-            logging.info(f"Deleted existing collection (if any): {COLLECTION_NAME}")
-        except Exception: pass
+        except: pass
 
-        logging.info(f"Creating new collection '{COLLECTION_NAME}' with embedding function.")
+        logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
+        # Create collection WITHOUT embedding function - we provide pre-computed ones
         collection = chroma_client.create_collection(
             name=COLLECTION_NAME,
-            embedding_function=ef,
-            metadata={"hnsw:space": "cosine"}
+            metadata={"hnsw:space": "cosine"} # Or dot if BGE prefers
         )
-        logging.info(f"Created new collection '{COLLECTION_NAME}'.")
 
-        # Add Documents in Batches
-        logging.info(f"Adding documents to ChromaDB (ChromaDB will embed)...")
+        logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
         start_time = time.time()
-        total_added = 0
         error_count = 0
-        num_batches = (len(documents) + EMBEDDING_BATCH_SIZE - 1) // EMBEDDING_BATCH_SIZE
-        progress_bar = st.progress(0, text="Embedding documents (this takes time)...")
+        num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
+        progress_bar = st.progress(0, text="Loading embeddings into memory...")
 
         for i in range(num_batches):
-            start_idx = i * EMBEDDING_BATCH_SIZE
-            end_idx = start_idx + EMBEDDING_BATCH_SIZE
-            batch_docs = documents[start_idx:end_idx]
-            batch_metadatas = metadatas[start_idx:end_idx]
-            batch_ids = ids[start_idx:end_idx]
+            start_idx = i * ADD_BATCH_SIZE
+            end_idx = start_idx + ADD_BATCH_SIZE
+            batch_df = df.iloc[start_idx:end_idx]
 
             try:
-                collection.add(documents=batch_docs, metadatas=batch_metadatas, ids=batch_ids)
-                total_added += len(batch_ids)
+                collection.add(
+                    ids=batch_df['id'].tolist(),
+                    embeddings=batch_df['embedding'].tolist(),
+                    documents=batch_df['document'].tolist(),
+                    metadatas=batch_df['metadata'].tolist()
+                )
             except Exception as e:
-                logging.error(f"Error adding batch starting at index {start_idx}: {e}")
+                logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
                 error_count += 1
-            progress_bar.progress((i + 1) / num_batches, text=f"Embedding documents... Batch {i+1}/{num_batches}")
+            progress_bar.progress((i + 1) / num_batches, text=f"Loading embeddings... Batch {i+1}/{num_batches}")
 
         progress_bar.empty()
         end_time = time.time()
-        logging.info(f"Finished adding documents process.")
-        logging.info(f"Successfully added {total_added} documents to ChromaDB.")
+        logging.info(f"Finished loading data into in-memory ChromaDB. Took {end_time - start_time:.2f} seconds.")
         if error_count > 0:
-            logging.warning(f"Encountered errors in {error_count} batches during add.")
-        logging.info(f"Document adding took {end_time - start_time:.2f} seconds.")
-
-        # Create flag file on success
-        os.makedirs(DB_PATH, exist_ok=True)
-        with open(INIT_FLAG_FILE, 'w') as f:
-            f.write('initialized')
-
-        st.success(f"Database initialized successfully with {total_added} documents.")
-        return True
+            logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
 
-    except Exception as e:
-        st.error(f"Failed to initialize database: {e}")
-        logging.exception(f"An unexpected error occurred during database initialization: {e}")
-        return False
-
-
-# --- Caching Functions ---
-# Modified to depend on successful DB initialization
-@st.cache_resource
-def load_chromadb_collection():
-    if not initialize_database():
-        st.error("Database initialization failed. Cannot load collection.")
-        st.stop()
-
-    logging.info(f"Attempting to load ChromaDB collection: {COLLECTION_NAME}")
-    try:
-        _client = chromadb.PersistentClient(path=DB_PATH)
-        collection = _client.get_collection(name=COLLECTION_NAME)
-        logging.info(f"Collection '{COLLECTION_NAME}' loaded successfully.")
+        st.success("Embeddings loaded successfully!")
         return collection
+
+    except ImportError as e:
+        st.error(f"ImportError: {e}. Required libraries might be missing (datasets, pandas, pyarrow). Check requirements.txt.")
+        logging.error(f"ImportError during dataset loading/Chroma setup: {e}")
+        st.stop()
     except Exception as e:
-        st.error(f"Failed to load ChromaDB collection '{COLLECTION_NAME}' after initialization attempt: {e}")
-        logging.error(f"Failed to load ChromaDB collection after initialization attempt: {e}")
-        return None
+        st.error(f"Failed to load data and initialize ChromaDB: {e}")
+        logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
+        st.stop()
+        return None # Should not be reached
+
+# --- Load data and collection ---
+collection = load_data_and_setup_chroma()
+# ---
 
 # --- Helper Functions ---
 def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MODEL):
     """Sends the prompt to the HF Inference API using the initialized client."""
     if not client_instance:
         client_instance = generation_client
-
     if not client_instance:
         logging.error("HF Inference client not initialized in query_hf_inference.")
         return "Error: HF Inference client failed to initialize."
     try:
-        response_text = client_instance.text_generation(
-            prompt,
-            max_new_tokens=MAX_NEW_TOKENS,
-        )
+        response_text = client_instance.text_generation(prompt, max_new_tokens=MAX_NEW_TOKENS)
         if not response_text:
             logging.warning(f"Received empty response from HF Inference API ({model_name}) for prompt: {prompt[:100]}...")
             return "Error: Received empty response from generation model."
@@ -271,7 +227,6 @@ commencement schedule
 
 User Query: "{query}"
 Output:"""
-
     logging.info(f"Generating query variations for: {query} using {model_name}")
     try:
         response = llm_func(prompt, model_name=model_name)
@@ -304,18 +259,16 @@ Answer:"""
     return prompt
 
 # --- Streamlit App UI ---
-# st.set_page_config(layout="wide") # MOVED TO TOP
-st.title("📚 Ask the Library Guides (Local Embed + HF Gen)") # Updated title
-
-# Load resources (this now includes the initialization check)
-collection = load_chromadb_collection()
+st.set_page_config(layout="wide")
+st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
 
 # User input (only proceed if collection loaded)
 if collection:
     query = st.text_area("Enter your question:", height=100)
 else:
-    st.error("Application cannot proceed: Failed to load or initialize ChromaDB collection.")
-    st.stop() # Stop if collection failed to load
+    # Error handled during load_data_and_setup_chroma
+    st.error("Application initialization failed. Cannot proceed.")
+    st.stop()
 
 # --- Routing Prompt Definition ---
 ROUTING_PROMPT_TEMPLATE = """You are a query routing assistant for a library chatbot. Your task is to classify the user's query into one of the following categories based on its intent:
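The elif chain further down acts on exact labels (`HOURS`, `CATALOG_SEARCH`, ...), so the classifier's raw completion has to be normalized to one of those labels first; that step is not visible in this diff. A minimal, assumed sketch of such a normalization (the function name and fallback choice are hypothetical):

```python
VALID_ROUTES = {
    "HOURS", "CATALOG_SEARCH", "ILL_REQUEST", "ACCOUNT_INFO",
    "TECH_SUPPORT", "EVENTS_CALENDAR", "RESEARCH_QUERY",
}

def normalize_route(llm_output: str) -> str:
    # Take the first token, strip quotes/punctuation, and upper-case it
    tokens = llm_output.strip().split()
    label = tokens[0].strip('"\'.:`').upper() if tokens else ""
    # Fall back to the RAG path when the model answers off-script
    return label if label in VALID_ROUTES else "RESEARCH_QUERY"
```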
@@ -387,22 +340,7 @@ if collection and st.button("Ask"):
     if route_decision == "HOURS":
         st.info("You can find the current library hours here: [https://gc-cuny.libcal.com/hours](https://gc-cuny.libcal.com/hours)")
         st.stop()
-    elif route_decision == "CATALOG_SEARCH":
-        catalog_url = "https://cuny-gc.primo.exlibrisgroup.com/discovery/search?vid=01CUNY_GC:CUNY_GC"
-        st.info(f"To check for specific books, journals, or articles, please search the library catalog directly here: [{catalog_url}]({catalog_url})")
-        st.stop()
-    elif route_decision == "ILL_REQUEST":
-        ill_url = "https://ezproxy.gc.cuny.edu/login?url=https://gc-cuny.illiad.oclc.org/illiad/illiad.dll"
-        st.info(f"For Interlibrary Loan requests or questions, please use the ILL system here: [{ill_url}]({ill_url})")
-        st.stop()
-    elif route_decision == "ACCOUNT_INFO":
-        account_url = "https://cuny-gc.primo.exlibrisgroup.com/discovery/account?vid=01CUNY_GC:CUNY_GC&section=overview"
-        st.info(f"To manage your library account (renewals, fines, etc.), please log in here: [{account_url}]({account_url})")
-        st.stop()
-    elif route_decision == "TECH_SUPPORT":
-        support_url = "https://docs.google.com/forms/d/e/1FAIpQLSdF3a-Au-jIYRDN-mxU3MpZSANQJWFx0VEN2if01iRucIXsZA/viewform"
-        st.info(f"To report a problem with accessing e-resources or other technical issues, please use this form: [{support_url}]({support_url})")
-        st.stop()
+    # ... (other routes) ...
     elif route_decision == "EVENTS_CALENDAR":
         events_url = "https://gc-cuny.libcal.com/calendar?cid=15537&t=d&d=0000-00-00&cal=15537&inc=0"
         st.info(f"You can find information about upcoming library events and workshops on the calendar here: [{events_url}]({events_url})")
@@ -417,16 +355,26 @@
     all_queries = [query] + query_variations
     logging.info(f"--- DIAGNOSTIC: All queries for search: {all_queries}")
 
-    # 2. Vector Search (ChromaDB handles query embedding internally)
+    # 2. Embed Queries Locally
+    try:
+        logging.info(f"Generating query embeddings locally using {LOCAL_EMBEDDING_MODEL}...")
+        query_embeddings = embedding_model.encode(all_queries).tolist()
+        logging.info(f"Generated {len(query_embeddings)} query embeddings locally.")
+    except Exception as e:
+        st.error(f"Failed to embed query using local model: {e}")
+        logging.exception(f"Failed to embed query using local model: {e}")
+        st.stop()
+
+    # 3. Vector Search (using pre-computed query embeddings)
     vector_results_ids = []
     context_chunks = []
     context_metadata_list = []
 
     try:
-        logging.info(f"Performing vector search for {len(all_queries)} queries (ChromaDB will embed)...")
-        # Query ChromaDB using query_texts - it uses the collection's embedding function
+        logging.info(f"Performing vector search for {len(query_embeddings)} embeddings...")
+        # Query ChromaDB using the computed query_embeddings
         vector_results = collection.query(
-            query_texts=all_queries, # Pass texts, not embeddings
+            query_embeddings=query_embeddings, # Pass embeddings now
             n_results=INITIAL_N_RESULTS,
             include=['documents', 'metadatas', 'distances']
         )
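With several query variations, `collection.query()` returns parallel lists (one entry per query), so the `INITIAL_N_RESULTS` candidates per query must be flattened and de-duplicated down to the `TOP_K` unique chunks mentioned in the sidebar; that selection logic sits outside this hunk. A minimal sketch of one way to do it, keeping each document's best (smallest) distance:

```python
def select_unique_chunks(vector_results, top_k=10):
    # vector_results["ids"][q][r] is the r-th hit for query q; the
    # documents/metadatas/distances lists have the same shape
    best = {}
    for q_ids, q_docs, q_metas, q_dists in zip(
        vector_results["ids"], vector_results["documents"],
        vector_results["metadatas"], vector_results["distances"],
    ):
        for id_, doc, meta, dist in zip(q_ids, q_docs, q_metas, q_dists):
            if id_ not in best or dist < best[id_][2]:
                best[id_] = (doc, meta, dist)
    ranked = sorted(best.items(), key=lambda kv: kv[1][2])[:top_k]
    return [(doc, meta) for _, (doc, meta, _) in ranked]
```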
@@ -491,7 +439,7 @@
         logging.exception("Vector search/selection failed.")
         context_chunks = []
 
-    # 3. Generate Final Prompt based on Route
+    # 4. Generate Final Prompt based on Route
     if route_decision == "RESEARCH_QUERY":
         logging.info("Using RESEARCH_QUERY prompt template.")
         final_prompt = RESEARCH_QUERY_PROMPT_TEMPLATE.format(context_str="\n\n".join(context_chunks), query=query)
@@ -499,14 +447,14 @@
         logging.info("Using standard RAG prompt template.")
         final_prompt = generate_prompt(query, context_chunks)
 
-    # 4. Query HF Inference API LLM
+    # 5. Query HF Inference API LLM
     logging.info(f"Sending final prompt to HF Inference API model: {HF_GENERATION_MODEL}...")
     answer = query_hf_inference(final_prompt)
     logging.info(f"Received answer from HF Inference API: {answer[:100]}...")
     if answer.startswith("Error:"):
         st.error(f"Answer generation failed: {answer}")
 
-    # 5. Display results
+    # 6. Display results
     st.subheader("Answer:")
     st.markdown(answer)
 
@@ -527,13 +475,14 @@
 st.sidebar.header("How to Use")
 st.sidebar.info(
     "1. Ensure your `HUGGING_FACE_HUB_TOKEN` is correctly set as a Space secret (`HF_TOKEN`) or in the `.env` file.\n"
-    f"2. The app will automatically create/embed the database using `{LOCAL_EMBEDDING_MODEL}` on first run if needed (requires `{INPUT_FILE}` to be present).\n"
+    f"2. The app will load pre-computed embeddings from the HF Dataset (`{HF_DATASET_ID}`).\n"
+    " (Ensure the dataset was created correctly using `export_chroma_to_parquet.py` and `upload_dataset_to_hf.py`)\n"
     "3. Enter your question in the text area.\n"
     "4. Click 'Ask'."
 )
 st.sidebar.header("Configuration")
-st.sidebar.markdown(f"**Embedding:** Local (`{LOCAL_EMBEDDING_MODEL}` via ChromaDB)")
+st.sidebar.markdown(f"**Embedding:** Pre-computed (`{LOCAL_EMBEDDING_MODEL}` loaded from HF Dataset)")
 st.sidebar.markdown(f"**LLM (HF API):** `{HF_GENERATION_MODEL}`")
-st.sidebar.markdown(f"**ChromaDB Collection:** `{COLLECTION_NAME}`")
+st.sidebar.markdown(f"**ChromaDB Collection:** `{COLLECTION_NAME}` (In-Memory)")
 st.sidebar.markdown(f"**Retrieval Mode:** Vector Search Only")
 st.sidebar.markdown(f"**Final Unique Chunks:** `{TOP_K}` (from initial `{INITIAL_N_RESULTS}` vector search)")
 