Upload app.py
app.py
CHANGED
@@ -93,12 +93,10 @@ def load_data_and_setup_chroma():
     # Ensure dependent resources are loaded first
     if not generation_client or not embedding_model:
         st.error("Required clients/models not initialized. Cannot proceed.")
-        # Potentially redundant with individual init checks, but safe
         st.stop()
 
     try:
         logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
-        # Download the specific parquet file from the dataset repo
         try:
             parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
             logging.info(f"Downloaded dataset file to: {parquet_path}")
@@ -111,24 +109,21 @@ def load_data_and_setup_chroma():
         df = pd.read_parquet(parquet_path)
         logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
 
-        # Verify required columns
         required_cols = ['id', 'document', 'embedding', 'metadata']
         if not all(col in df.columns for col in required_cols):
             st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
             logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
             st.stop()
 
-        # Ensure embeddings are lists of floats
         logging.info("Ensuring embeddings are in list format...")
-
-        if not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float):
+        if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
             df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
             logging.info("Converted embeddings to list[float].")
         else:
-            logging.info("Embeddings already seem to be in list[float] format.")
+            logging.info("Embeddings already seem to be in list[float] format or DataFrame is empty.")
 
         initial_rows = len(df)
-        df.dropna(subset=['embedding'], inplace=True)
+        df.dropna(subset=['embedding'], inplace=True)
         if len(df) < initial_rows:
             logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
 
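Note on this hunk: the added `df.empty` / `is None` guard prevents an `IndexError` or `TypeError` when probing the first row. A minimal standalone sketch of the same normalization, using a made-up three-row DataFrame (none of these values come from app.py):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "id": ["a", "b", "c"],
    "embedding": [np.array([0.1, 0.2]), [0.3, 0.4], "bad value"],
})

# Coerce ndarray/list embeddings to list[float]; anything else becomes None
df["embedding"] = df["embedding"].apply(
    lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None
)
df.dropna(subset=["embedding"], inplace=True)  # drops the "bad value" row
print(df["embedding"].tolist())  # [[0.1, 0.2], [0.3, 0.4]]
```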
@@ -138,7 +133,13 @@ def load_data_and_setup_chroma():
             st.stop()
 
         logging.info("Initializing in-memory ChromaDB client...")
-        chroma_client = chromadb.Client()
+        # Explicitly configure for in-memory using DuckDB+Parquet
+        settings = chromadb.config.Settings(
+            chroma_api_impl="local",
+            chroma_db_impl="duckdb+parquet",
+            persist_directory=None # Ensure no persistence is attempted
+        )
+        chroma_client = chromadb.Client(settings=settings)
 
         try:
             chroma_client.delete_collection(name=COLLECTION_NAME)
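Note on this hunk: `chroma_api_impl="local"` and `chroma_db_impl="duckdb+parquet"` are legacy settings that chromadb removed in 0.4, so whether this code runs depends on the version pinned in the Space's requirements. A hedged sketch of both forms (assuming nothing about the actual pin):

```python
import chromadb

# chromadb >= 0.4: purely in-memory client, nothing persisted
client = chromadb.EphemeralClient()

# chromadb < 0.4: the legacy Settings-based construction used in this diff
# client = chromadb.Client(chromadb.config.Settings(
#     chroma_api_impl="local",
#     chroma_db_impl="duckdb+parquet",
#     persist_directory=None,
# ))
```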
@@ -146,7 +147,6 @@ def load_data_and_setup_chroma():
         except: pass
 
         logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
-        # Create collection WITHOUT embedding function
         collection = chroma_client.create_collection(
             name=COLLECTION_NAME,
             metadata={"hnsw:space": "cosine"}
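Note on this hunk: the deleted comment made explicit that the collection has no `embedding_function`, so Chroma never vectorizes anything itself: `add` must receive precomputed `embeddings`, and searches must use `query_embeddings` rather than `query_texts`. A sketch of the query side; `embedding_model.encode(...)` assumes a sentence-transformers-style model, which may differ from what app.py actually holds in `embedding_model`:

```python
# Embed the question with the same model that produced the dataset vectors,
# then search by vector; the collection cannot embed raw text on its own.
query_vec = embedding_model.encode("example question").tolist()
results = collection.query(query_embeddings=[query_vec], n_results=5)
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"{dist:.3f}  {doc[:80]}")
```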
@@ -164,47 +164,41 @@ def load_data_and_setup_chroma():
             batch_df = df.iloc[start_idx:end_idx]
 
             try:
-                #
-                metadatas_list = batch_df['metadata'].tolist()
-                if metadatas_list and isinstance(metadatas_list[0], dict):
-                    pass # Already list of dicts
-                else:
-                    # Attempt to parse if they are JSON strings, otherwise use empty dicts
-                    parsed_metadatas = []
-                    for item in metadatas_list:
-                        try:
-                            parsed = json.loads(item) if isinstance(item, str) else item
-                            parsed_metadatas.append(parsed if isinstance(parsed, dict) else {})
-                        except:
-                            parsed_metadatas.append({})
-                    metadatas_list = parsed_metadatas # This line has the wrong indentation
-
-                # --- Clean None values from metadata ---
+                # Prepare metadata for the batch
+                metadatas_list_raw = batch_df['metadata'].tolist()
                 cleaned_metadatas = []
-                for meta_dict in metadatas_list:
+                for item in metadatas_list_raw:
                     cleaned_dict = {}
-
-                    if isinstance(meta_dict, dict):
-                        for key, value in meta_dict.items():
+                    # Handle potential non-dict items loaded from parquet/dataset
+                    if isinstance(item, dict):
+                        current_meta = item
+                    else:
+                        try: # Attempt to parse if it's a JSON string
+                            current_meta = json.loads(item) if isinstance(item, str) else {}
+                        except:
+                            current_meta = {} # Default to empty dict if not dict or valid JSON
+
+                    # Clean None values within the dictionary
+                    if isinstance(current_meta, dict):
+                        for key, value in current_meta.items():
                             if value is None:
-                                cleaned_dict[key] = ""
+                                cleaned_dict[key] = "" # Replace None with empty string
                             elif isinstance(value, (str, int, float, bool)):
-                                cleaned_dict[key] = value
+                                cleaned_dict[key] = value # Keep allowed types
                             else:
-                                # Attempt to convert
-                                try:
+                                try: # Attempt to convert others to string
                                     cleaned_dict[key] = str(value)
                                     logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
                                 except:
                                     logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
                     cleaned_metadatas.append(cleaned_dict)
-                # -----------------------------------------
 
+                # Add the batch with cleaned metadata
                 collection.add(
                     ids=batch_df['id'].tolist(),
                     embeddings=batch_df['embedding'].tolist(),
                     documents=batch_df['document'].tolist(),
-                    metadatas=cleaned_metadatas # Use cleaned list
+                    metadatas=cleaned_metadatas # Use the cleaned list
                 )
             except Exception as e:
                 logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
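Note on this hunk: the rewritten cleaning loop is correct but dense; factored into a helper it becomes testable in isolation. A sketch of equivalent logic (the name `clean_metadata` is mine, not in app.py):

```python
import json

def clean_metadata(item):
    """Coerce one raw metadata entry into a dict Chroma accepts:
    only str/int/float/bool values, never None."""
    if isinstance(item, dict):
        meta = item
    else:
        try:
            meta = json.loads(item) if isinstance(item, str) else {}
        except ValueError:
            meta = {}
    if not isinstance(meta, dict):
        meta = {}
    cleaned = {}
    for key, value in meta.items():
        if value is None:
            cleaned[key] = ""  # Chroma rejects None metadata values
        elif isinstance(value, (str, int, float, bool)):
            cleaned[key] = value
        else:
            cleaned[key] = str(value)  # last-resort stringification
    return cleaned

print(clean_metadata('{"title": "Doc", "year": null}'))
# -> {'title': 'Doc', 'year': ''}
```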
@@ -217,6 +211,13 @@ def load_data_and_setup_chroma():
         if error_count > 0:
             logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
 
+        # Verify count after adding
+        final_count = collection.count()
+        logging.info(f"Final document count in Chroma collection: {final_count}")
+        if final_count == 0 and len(df) > 0:
+            st.warning("ChromaDB collection is empty after attempting to add documents. Check logs for errors.")
+        # Don't necessarily stop, but warn the user.
+
         st.success("Embeddings loaded successfully!")
         return collection
 
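Note on this hunk: the `collection.count()` check catches the silent-failure case where every batch `add` raised and the per-batch `except` swallowed the errors. The same pattern works as a standalone smoke test; this sketch assumes chromadb >= 0.4 and uses toy two-dimensional vectors:

```python
import chromadb

client = chromadb.EphemeralClient()
col = client.get_or_create_collection("smoke_test", metadata={"hnsw:space": "cosine"})
col.add(
    ids=["1", "2"],
    embeddings=[[1.0, 0.0], [0.0, 1.0]],
    documents=["first doc", "second doc"],
    metadatas=[{"source": "a"}, {"source": "b"}],
)
assert col.count() == 2  # mirrors the final_count verification added above
print(col.query(query_embeddings=[[1.0, 0.0]], n_results=1)["documents"])  # [['first doc']]
```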