Spaces:
Sleeping
Sleeping
File size: 4,746 Bytes
890d952 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
#!/usr/bin/env python3
"""
Test script to explore all metadata fields available in the FAISS database chunks.
"""
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Configuration
# Location of the saved FAISS index; explore_metadata() checks that it exists
# and passes it to FAISS.load_local().
FAISS_INDEX_PATH: str = "faiss_index"
# Model name handed to HuggingFaceEmbeddings when the index is loaded.
# NOTE(review): presumably this must match the model used to build the index -- confirm.
EMBEDDINGS_MODEL_NAME: str = "sentence-transformers/all-MiniLM-L6-v2"
def _source_file_type(source):
    """Return the file extension of *source* without its dot, or 'unknown'.

    Only the final path component is inspected, so a dot inside a directory
    name is not mistaken for an extension. Non-string values are converted
    with str() before being examined.
    """
    extension = os.path.splitext(str(source))[1]
    return extension[1:] if len(extension) > 1 else "unknown"


def explore_metadata():
    """Explore all metadata fields available in the database chunks.

    Loads the FAISS index at FAISS_INDEX_PATH, runs a fixed list of
    similarity queries against it, and prints every metadata key found on
    the returned documents together with example values, a few documents
    with their complete metadata, and a short summary.

    Returns:
        bool: True once the report has been printed; False when the index
        path does not exist or loading the index raised an exception.
    """
    print("EXPLORING METADATA IN FAISS DATABASE")
    print("=" * 60)

    if not os.path.exists(FAISS_INDEX_PATH):
        print(f"❌ Error: FAISS index not found at {FAISS_INDEX_PATH}")
        return False

    try:
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
        # SECURITY: allow_dangerous_deserialization=True lets load_local
        # unpickle data stored next to the index. Only point this script at
        # an index that comes from a trusted source.
        vector_db = FAISS.load_local(
            FAISS_INDEX_PATH,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    except Exception as e:
        print(f"❌ Error loading FAISS index: {e}")
        return False
    print(f"✅ Successfully loaded FAISS index from {FAISS_INDEX_PATH}")

    # Get a sample of documents to analyze metadata
    sample_queries = [
        "Ethernet Interfaces Summary",
        "UNUSED",
        "interface configuration",
        "device information",
        "fabric",
    ]
    # Maps each metadata key to the distinct values seen for it, in
    # first-seen order. Its keys are exactly the set of metadata keys found.
    metadata_examples = {}

    print("\nSampling documents to analyze metadata...")
    print("-" * 40)
    for query in sample_queries:
        # Best effort: a failing query is reported and the rest still run.
        try:
            results = vector_db.similarity_search_with_score(query, k=3)
            for doc, _score in results:
                for key, value in (doc.metadata or {}).items():
                    seen = metadata_examples.setdefault(key, [])
                    # List membership rather than a set so that unhashable
                    # metadata values can still be de-duplicated.
                    if value not in seen:
                        seen.append(value)
        except Exception as e:
            print(f"Error with query '{query}': {e}")

    # Display metadata analysis
    # NOTE(review): the leading 'π' in the headings below looks like a
    # mis-encoded symbol; the intended glyph cannot be determined from this
    # file, so those strings are left unchanged.
    metadata_keys = sorted(metadata_examples)
    print("\nπ METADATA ANALYSIS")
    print("=" * 60)
    print(f"Total unique metadata keys found: {len(metadata_keys)}")
    print(f"Metadata keys: {metadata_keys}")

    print("\nπ DETAILED METADATA FIELDS:")
    print("-" * 40)
    for key in metadata_keys:
        examples = metadata_examples[key]
        print(f"\nπ Field: '{key}'")
        print(f" Unique values found: {len(examples)}")
        print(" Example values:")
        # Show max 5 examples
        for i, example in enumerate(examples[:5], start=1):
            print(f" {i}: {example!r}")
        if len(examples) > 5:
            print(f" ... and {len(examples) - 5} more")

    # Show some detailed examples
    print("\nπ SAMPLE DOCUMENTS WITH FULL METADATA:")
    print("-" * 40)
    # Get a few documents to show complete metadata. Guarded like the
    # sampling queries above so a search failure cannot escape as an
    # exception from a function that reports its outcome as a bool.
    try:
        sample_results = vector_db.similarity_search_with_score("Ethernet", k=3)
    except Exception as e:
        print(f"Error with query 'Ethernet': {e}")
        sample_results = []

    for i, (doc, score) in enumerate(sample_results, start=1):
        preview = doc.page_content[:100].replace("\n", " ")
        print(f"\n[SAMPLE {i}]")
        print(f"Score: {score:.4f}")
        print(f"Content Length: {len(doc.page_content)} characters")
        print(f"Content Preview: {preview}...")
        print("Complete Metadata:")
        if doc.metadata:
            for key, value in sorted(doc.metadata.items()):
                print(f" {key}: {value!r}")
        else:
            print(" No metadata found")
        print("-" * 30)

    # Analysis summary
    print("\nπ SUMMARY:")
    print("=" * 60)
    device_names = metadata_examples.get("device_name", [])
    sources = metadata_examples.get("source", [])
    # Counts distinct non-empty device_name values among the sampled
    # documents (metadata_examples only stores unique values per key).
    device_docs = sum(1 for name in device_names if name)
    print(f"• Device documents found: {device_docs}")
    print(f"• Source files found: {len(sources)}")
    if "device_name" in metadata_examples:
        print(f"• Device names: {device_names}")
    if "source" in metadata_examples:
        source_types = {_source_file_type(source) for source in sources}
        print(f"• Source file types: {source_types}")
    return True
def main() -> int:
    """Run the metadata exploration and report the outcome.

    Returns:
        int: 0 when explore_metadata() returns a truthy value, 1 otherwise.
        The value is used as the process exit status by the script entry
        point.
    """
    if explore_metadata():
        print("\n✅ Metadata exploration completed successfully!")
        return 0
    print("\n❌ Metadata exploration failed")
    return 1
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling the `exit` helper: that
    # helper is injected by the `site` module for interactive sessions and is
    # absent when Python runs with -S. main()'s return value becomes the
    # process exit status either way.
    raise SystemExit(main())
|