#!/usr/bin/env python3
"""
Test script to explore all metadata fields available in the FAISS database chunks.
"""

import os

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Configuration
FAISS_INDEX_PATH = "faiss_index"
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Queries used to pull a sample of documents whose metadata is inspected.
_SAMPLE_QUERIES = (
    "Ethernet Interfaces Summary",
    "UNUSED",
    "interface configuration",
    "device information",
    "fabric",
)

# Maximum number of example values printed per metadata field.
_MAX_EXAMPLES_SHOWN = 5


def _source_extension(source):
    """Return the file extension of *source* without the dot, or 'unknown'.

    ``str()`` is applied first so a non-string metadata value cannot raise,
    and ``os.path.splitext`` is used so a dot in a directory name is not
    mistaken for an extension separator.
    """
    _, ext = os.path.splitext(str(source))
    return ext.lstrip(".") or "unknown"


def _collect_metadata_examples(vector_db, queries, k=3):
    """Run each query and gather the distinct values seen for every metadata key.

    Returns a dict mapping metadata key -> list of distinct values, in the
    order they were first encountered. A failing query is reported and
    skipped (best effort) rather than aborting the whole exploration.
    """
    metadata_examples = {}
    for query in queries:
        try:
            results = vector_db.similarity_search_with_score(query, k=k)
            for doc, _score in results:
                if not doc.metadata:
                    continue
                for key, value in doc.metadata.items():
                    seen = metadata_examples.setdefault(key, [])
                    # List membership (==) is used instead of a set so that
                    # unhashable metadata values are still supported.
                    if value not in seen:
                        seen.append(value)
        except Exception as e:  # best effort: library error types are unknown
            print(f"Error with query '{query}': {e}")
    return metadata_examples


def _print_field_report(metadata_examples):
    """Print the list of metadata keys and a few example values for each."""
    metadata_keys = sorted(metadata_examples)

    print("\n🔍 METADATA ANALYSIS")
    print("=" * 60)
    print(f"Total unique metadata keys found: {len(metadata_keys)}")
    print(f"Metadata keys: {metadata_keys}")

    print("\n📋 DETAILED METADATA FIELDS:")
    print("-" * 40)

    for key in metadata_keys:
        examples = metadata_examples[key]
        print(f"\n🔑 Field: '{key}'")
        print(f" Unique values found: {len(examples)}")
        print(" Example values:")
        for i, example in enumerate(examples[:_MAX_EXAMPLES_SHOWN], start=1):
            print(f" {i}: {repr(example)}")
        if len(examples) > _MAX_EXAMPLES_SHOWN:
            print(f" ... and {len(examples) - _MAX_EXAMPLES_SHOWN} more")


def _print_sample_documents(sample_results):
    """Print score, a content preview and the full metadata of each result."""
    for i, (doc, score) in enumerate(sample_results, start=1):
        print(f"\n[SAMPLE {i}]")
        print(f"Score: {score:.4f}")
        print(f"Content Length: {len(doc.page_content)} characters")
        # chr(10) is a newline; previews are flattened onto a single line.
        print(f"Content Preview: {doc.page_content[:100].replace(chr(10), ' ')}...")
        print("Complete Metadata:")
        if doc.metadata:
            for key, value in sorted(doc.metadata.items()):
                print(f" {key}: {repr(value)}")
        else:
            print(" No metadata found")
        print("-" * 30)


def _print_summary(metadata_examples):
    """Print summary counts for the 'device_name' and 'source' fields."""
    device_names = metadata_examples.get("device_name", [])
    sources = metadata_examples.get("source", [])

    print("\n📊 SUMMARY:")
    print("=" * 60)

    # NOTE: this counts distinct non-empty device names seen in the sample,
    # not the number of documents.
    device_docs = sum(1 for name in device_names if name)

    print(f"• Device documents found: {device_docs}")
    print(f"• Source files found: {len(sources)}")

    if "device_name" in metadata_examples:
        print(f"• Device names: {device_names}")

    if "source" in metadata_examples:
        print(f"• Source file types: {set(_source_extension(src) for src in sources)}")


def explore_metadata():
    """Explore all metadata fields available in the database chunks.

    Loads the FAISS index at ``FAISS_INDEX_PATH``, samples documents with a
    handful of similarity queries, and prints the metadata keys, example
    values, a few complete documents and a short summary.

    Returns:
        True when the exploration ran to completion; False when the index is
        missing, cannot be loaded, or the sample-document search fails.
    """
    print("EXPLORING METADATA IN FAISS DATABASE")
    print("=" * 60)

    if not os.path.exists(FAISS_INDEX_PATH):
        print(f"❌ Error: FAISS index not found at {FAISS_INDEX_PATH}")
        return False

    try:
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
        # SECURITY: allow_dangerous_deserialization=True unpickles the stored
        # docstore, which can execute arbitrary code. Only point this script
        # at an index you created yourself or otherwise fully trust.
        vector_db = FAISS.load_local(
            FAISS_INDEX_PATH,
            embeddings,
            allow_dangerous_deserialization=True,
        )
        print(f"✅ Successfully loaded FAISS index from {FAISS_INDEX_PATH}")
    except Exception as e:
        print(f"❌ Error loading FAISS index: {e}")
        return False

    print("\nSampling documents to analyze metadata...")
    print("-" * 40)
    metadata_examples = _collect_metadata_examples(vector_db, _SAMPLE_QUERIES)

    _print_field_report(metadata_examples)

    print("\n📄 SAMPLE DOCUMENTS WITH FULL METADATA:")
    print("-" * 40)

    # Guarded like the other failure paths so a search error is reported and
    # turned into a False result instead of an unhandled traceback.
    try:
        sample_results = vector_db.similarity_search_with_score("Ethernet", k=3)
    except Exception as e:
        print(f"❌ Error retrieving sample documents: {e}")
        return False
    _print_sample_documents(sample_results)

    _print_summary(metadata_examples)

    return True


def main():
    """Run the metadata exploration and return a process exit code (0 or 1)."""
    success = explore_metadata()

    if success:
        print("\n✅ Metadata exploration completed successfully!")
        return 0

    print("\n❌ Metadata exploration failed")
    return 1


if __name__ == "__main__":
    # SystemExit is used directly: the `exit` builtin is injected by the
    # `site` module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(main())