Spaces:

Zwounds
/

LibraryRAG

Sleeping

App Files Community

Zwounds committed on Mar 31

Commit

5b66564

verified ·

1 Parent(s): cab221e

Upload app.py

Browse files

Files changed (1) hide show

app.py +45 -21

app.py CHANGED Viewed

@@ -16,6 +16,10 @@ from sentence_transformers import SentenceTransformer
 # Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
 # import chromadb.utils.embedding_functions as embedding_functions
 # --- Configuration ---
 # DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
 COLLECTION_NAME = "libguides_content"
@@ -86,39 +90,45 @@ embedding_model = load_local_embedding_model()
 # --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
 @st.cache_resource
 def load_data_and_setup_chroma():
     if not generation_client or not embedding_model:
          st.error("Required clients/models not initialized. Cannot proceed.")
          st.stop()
     try:
         logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
-        # Load the dataset - might need split='train' if applicable
-        # Handle potential errors during download/load
         try:
-            dataset = load_dataset(HF_DATASET_ID, split='train') # Assuming default split is 'train'
-        except Exception as load_e:
-             logging.error(f"Failed to load dataset '{HF_DATASET_ID}': {load_e}")
-             st.error(f"Failed to load dataset '{HF_DATASET_ID}'. Check dataset ID and availability.")
              st.stop()
-        logging.info("Converting dataset to Pandas DataFrame...")
-        df = dataset.to_pandas()
         logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
         # Verify required columns
         required_cols = ['id', 'document', 'embedding', 'metadata']
         if not all(col in df.columns for col in required_cols):
-            st.error(f"Dataset is missing required columns. Found: {df.columns}. Required: {required_cols}")
-            logging.error(f"Dataset missing required columns. Found: {df.columns}")
             st.stop()
-        # Ensure embeddings are lists of floats (Parquet might store them efficiently)
-        # This might not be strictly necessary if ChromaDB handles numpy arrays, but safer to convert
         logging.info("Ensuring embeddings are in list format...")
-        df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
-        # Drop rows where embedding conversion failed
         initial_rows = len(df)
-        df.dropna(subset=['embedding'], inplace=True)
         if len(df) < initial_rows:
             logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
@@ -130,16 +140,16 @@ def load_data_and_setup_chroma():
         logging.info("Initializing in-memory ChromaDB client...")
         chroma_client = chromadb.Client() # In-memory client
-        # Delete collection if it somehow exists in memory (unlikely but safe)
         try:
             chroma_client.delete_collection(name=COLLECTION_NAME)
         except: pass
         logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
-        # Create collection WITHOUT embedding function - we provide pre-computed ones
         collection = chroma_client.create_collection(
             name=COLLECTION_NAME,
-            metadata={"hnsw:space": "cosine"} # Or dot if BGE prefers
         )
         logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
@@ -154,11 +164,26 @@ def load_data_and_setup_chroma():
             batch_df = df.iloc[start_idx:end_idx]
             try:
                 collection.add(
                     ids=batch_df['id'].tolist(),
                     embeddings=batch_df['embedding'].tolist(),
                     documents=batch_df['document'].tolist(),
-                    metadatas=batch_df['metadata'].tolist()
                 )
             except Exception as e:
                 logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
@@ -182,7 +207,7 @@ def load_data_and_setup_chroma():
         st.error(f"Failed to load data and initialize ChromaDB: {e}")
         logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
         st.stop()
-    return None # Should not be reached
 # --- Load data and collection ---
 collection = load_data_and_setup_chroma()
@@ -259,7 +284,6 @@ Answer:"""
     return prompt
 # --- Streamlit App UI ---
-st.set_page_config(layout="wide")
 st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
 # User input (only proceed if collection loaded)

 # Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
 # import chromadb.utils.embedding_functions as embedding_functions
+# --- Page Config (MUST BE FIRST Streamlit call) ---
+st.set_page_config(layout="wide")
+# ---
 # --- Configuration ---
 # DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
 COLLECTION_NAME = "libguides_content"
 # --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
 @st.cache_resource
 def load_data_and_setup_chroma():
+    # Ensure dependent resources are loaded first
     if not generation_client or not embedding_model:
          st.error("Required clients/models not initialized. Cannot proceed.")
+         # Potentially redundant with individual init checks, but safe
          st.stop()
     try:
         logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
+        # Download the specific parquet file from the dataset repo
         try:
+            parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
+            logging.info(f"Downloaded dataset file to: {parquet_path}")
+        except Exception as download_e:
+             logging.error(f"Failed to download dataset file '{PARQUET_FILENAME}' from '{HF_DATASET_ID}': {download_e}")
+             st.error(f"Failed to download dataset '{HF_DATASET_ID}'. Check dataset ID, filename, and token permissions.")
              st.stop()
+        logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
+        df = pd.read_parquet(parquet_path)
         logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
         # Verify required columns
         required_cols = ['id', 'document', 'embedding', 'metadata']
         if not all(col in df.columns for col in required_cols):
+            st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
+            logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
             st.stop()
+        # Ensure embeddings are lists of floats
         logging.info("Ensuring embeddings are in list format...")
+        # Check if the first embedding is already a list of floats, otherwise convert
+        if not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float):
+             df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
+             logging.info("Converted embeddings to list[float].")
+        else:
+             logging.info("Embeddings already seem to be in list[float] format.")
         initial_rows = len(df)
+        df.dropna(subset=['embedding'], inplace=True) # Drop rows where embedding is None
         if len(df) < initial_rows:
             logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
         logging.info("Initializing in-memory ChromaDB client...")
         chroma_client = chromadb.Client() # In-memory client
         try:
             chroma_client.delete_collection(name=COLLECTION_NAME)
+            logging.info(f"Deleted existing in-memory collection (if any): {COLLECTION_NAME}")
         except: pass
         logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
+        # Create collection WITHOUT embedding function
         collection = chroma_client.create_collection(
             name=COLLECTION_NAME,
+            metadata={"hnsw:space": "cosine"}
         )
         logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
             batch_df = df.iloc[start_idx:end_idx]
             try:
+                # Convert metadata column if it contains dicts
+                metadatas_list = batch_df['metadata'].tolist()
+                if metadatas_list and isinstance(metadatas_list[0], dict):
+                     pass # Already list of dicts
+                else:
+                     # Attempt to parse if they are JSON strings, otherwise use empty dicts
+                     parsed_metadatas = []
+                     for item in metadatas_list:
+                         try:
+                             parsed = json.loads(item) if isinstance(item, str) else item
+                             parsed_metadatas.append(parsed if isinstance(parsed, dict) else {})
+                         except:
+                             parsed_metadatas.append({})
+                     metadatas_list = parsed_metadatas
                 collection.add(
                     ids=batch_df['id'].tolist(),
                     embeddings=batch_df['embedding'].tolist(),
                     documents=batch_df['document'].tolist(),
+                    metadatas=metadatas_list
                 )
             except Exception as e:
                 logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
         st.error(f"Failed to load data and initialize ChromaDB: {e}")
         logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
         st.stop()
+    return None
 # --- Load data and collection ---
 collection = load_data_and_setup_chroma()
     return prompt
 # --- Streamlit App UI ---
 st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
 # User input (only proceed if collection loaded)