#!/usr/bin/env python3
"""
Test script to explore all metadata fields available in the FAISS database chunks.
"""
import os
import sys

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Configuration
# Location of the saved FAISS index; a relative path, so it is resolved
# against the current working directory.
FAISS_INDEX_PATH = "faiss_index"
# Model name handed to HuggingFaceEmbeddings when the index is loaded.
# NOTE(review): presumably the same model the index was built with -- confirm.
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
def _load_vector_db():
    """Load the FAISS index at FAISS_INDEX_PATH; print the reason and return None on failure."""
    if not os.path.exists(FAISS_INDEX_PATH):
        print(f"❌ Error: FAISS index not found at {FAISS_INDEX_PATH}")
        return None
    try:
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
        # SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
        # loading, which can execute arbitrary code. Only point this script at
        # index files you created yourself or otherwise trust.
        vector_db = FAISS.load_local(
            FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
        )
    except Exception as e:
        print(f"❌ Error loading FAISS index: {e}")
        return None
    print(f"✅ Successfully loaded FAISS index from {FAISS_INDEX_PATH}")
    return vector_db


def _collect_metadata_examples(vector_db, queries, k=3):
    """Map each metadata key to its distinct values (first-seen order) over the top-k hits per query."""
    examples = {}
    for query in queries:
        # Best-effort: a failing query is reported and skipped so the
        # remaining queries still contribute to the sample.
        try:
            results = vector_db.similarity_search_with_score(query, k=k)
            for doc, _score in results:
                if not doc.metadata:
                    continue
                for key, value in doc.metadata.items():
                    seen = examples.setdefault(key, [])
                    # List membership (not a set) so unhashable values work too.
                    if value not in seen:
                        seen.append(value)
        except Exception as e:
            print(f"Error with query '{query}': {e}")
    return examples


def _print_field_report(examples, max_examples=5):
    """Print the metadata keys found and up to *max_examples* distinct values per key."""
    keys = sorted(examples)
    print("\n📊 METADATA ANALYSIS")
    print("=" * 60)
    print(f"Total unique metadata keys found: {len(keys)}")
    print(f"Metadata keys: {keys}")

    print("\n📋 DETAILED METADATA FIELDS:")
    print("-" * 40)
    for key in keys:
        values = examples[key]
        print(f"\n🔑 Field: '{key}'")
        print(f" Unique values found: {len(values)}")
        print(" Example values:")
        for position, value in enumerate(values[:max_examples], start=1):
            print(f" {position}: {value!r}")
        if len(values) > max_examples:
            print(f" ... and {len(values) - max_examples} more")


def _print_sample_documents(vector_db, query="Ethernet", k=3):
    """Print score, a content preview and the full metadata of the top-k hits for *query*."""
    print("\n📄 SAMPLE DOCUMENTS WITH FULL METADATA:")
    print("-" * 40)
    # Guarded like the sampling queries: a failed lookup is reported instead
    # of aborting the whole report.
    try:
        sample_results = vector_db.similarity_search_with_score(query, k=k)
    except Exception as e:
        print(f"Error with query '{query}': {e}")
        return
    for number, (doc, score) in enumerate(sample_results, start=1):
        preview = doc.page_content[:100].replace("\n", " ")
        print(f"\n[SAMPLE {number}]")
        print(f"Score: {score:.4f}")
        print(f"Content Length: {len(doc.page_content)} characters")
        print(f"Content Preview: {preview}...")
        print("Complete Metadata:")
        if doc.metadata:
            for key, value in sorted(doc.metadata.items()):
                print(f" {key}: {value!r}")
        else:
            print(" No metadata found")
        print("-" * 30)


def _source_extension(source):
    """Return the file extension of *source* without the dot, or 'unknown' if it has none."""
    # str() tolerates non-string metadata values (None, Path, numbers);
    # splitext only inspects the final path component, so dots in directory
    # names are not mistaken for an extension.
    return os.path.splitext(str(source))[1].lstrip(".") or "unknown"


def _print_summary(examples):
    """Print counts and values for the 'device_name' and 'source' metadata fields."""
    print("\n📈 SUMMARY:")
    print("=" * 60)
    device_names = examples.get("device_name", [])
    sources = examples.get("source", [])
    # Both figures count distinct values seen in the sample, not documents.
    device_docs = sum(1 for name in device_names if name)
    print(f"• Device documents found: {device_docs}")
    print(f"• Source files found: {len(sources)}")
    if "device_name" in examples:
        print(f"• Device names: {device_names}")
    if "source" in examples:
        print(f"• Source file types: {set(_source_extension(s) for s in sources)}")


def explore_metadata():
    """Explore all metadata fields available in the database chunks.

    Loads the FAISS index, runs a fixed set of similarity queries, and prints
    which metadata keys the returned documents carry, example values per key,
    a few documents with their complete metadata, and a short summary.

    Returns:
        True when the report was produced; False when the index is missing or
        could not be loaded (the reason is printed).
    """
    print("EXPLORING METADATA IN FAISS DATABASE")
    print("=" * 60)

    vector_db = _load_vector_db()
    if vector_db is None:
        return False

    # Queries used only to pull a varied sample of documents out of the index.
    sample_queries = [
        "Ethernet Interfaces Summary",
        "UNUSED",
        "interface configuration",
        "device information",
        "fabric",
    ]

    print("\nSampling documents to analyze metadata...")
    print("-" * 40)
    metadata_examples = _collect_metadata_examples(vector_db, sample_queries, k=3)

    _print_field_report(metadata_examples)
    _print_sample_documents(vector_db)
    _print_summary(metadata_examples)
    return True
def main() -> int:
    """Run the metadata exploration.

    Returns:
        0 when explore_metadata() reports success, 1 otherwise, suitable for
        use as a process exit code.
    """
    if explore_metadata():
        print("\n✅ Metadata exploration completed successfully!")
        return 0
    print("\n❌ Metadata exploration failed")
    return 1
if __name__ == "__main__":
    # sys.exit rather than the bare `exit` helper: `exit` is injected by the
    # `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen/embedded interpreters).
    sys.exit(main())