File size: 4,746 Bytes
890d952
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
"""
Test script to explore all metadata fields available in the FAISS database chunks.
"""

import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Configuration
# Location of the saved FAISS index. explore_metadata() checks it with
# os.path.exists() and hands it to FAISS.load_local(); being a relative path,
# it is resolved against the current working directory.
FAISS_INDEX_PATH = "faiss_index"
# Model name given to HuggingFaceEmbeddings when the index is loaded.
# NOTE(review): presumably this must be the same model the index was built
# with -- confirm against the indexing script.
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

def _record_metadata(examples, documents):
    """Add each distinct metadata value found in *documents* to *examples*.

    Args:
        examples: Dict mapping a metadata key to the list of distinct values
            seen so far; updated in place.
        documents: Iterable of objects exposing a ``metadata`` mapping.
    """
    for doc in documents:
        # Documents whose metadata is empty or None contribute nothing.
        for key, value in (doc.metadata or {}).items():
            seen = examples.setdefault(key, [])
            # A list scan (rather than a set) keeps first-seen order and
            # tolerates unhashable values such as lists or dicts.
            if value not in seen:
                seen.append(value)


def _source_file_types(sources):
    """Return the set of file extensions (without the dot) in *sources*.

    Entries that are not path-like, or that have no extension, are reported
    as ``'unknown'`` instead of raising.
    """
    file_types = set()
    for source in sources:
        if isinstance(source, (str, os.PathLike)):
            # splitext only inspects the final path component, so a dotted
            # directory name is not mistaken for an extension.
            extension = os.path.splitext(source)[1]
        else:
            extension = ""
        file_types.add(extension.lstrip(".") or "unknown")
    return file_types


def _print_sample_documents(sample_results):
    """Print score, content preview and full metadata for each (doc, score)."""
    for i, (doc, score) in enumerate(sample_results, start=1):
        preview = doc.page_content[:100].replace("\n", " ")
        print(f"\n[SAMPLE {i}]")
        print(f"Score: {score:.4f}")
        print(f"Content Length: {len(doc.page_content)} characters")
        print(f"Content Preview: {preview}...")
        print("Complete Metadata:")
        if doc.metadata:
            for key, value in sorted(doc.metadata.items()):
                print(f"  {key}: {value!r}")
        else:
            print("  No metadata found")
        print("-" * 30)


def explore_metadata(index_path=None, model_name=None) -> bool:
    """Explore all metadata fields available in the database chunks.

    Loads the FAISS index, runs a handful of sample similarity searches and
    prints which metadata keys appear on the returned chunks, example values
    for each key, a few fully expanded sample documents and a short summary.

    Args:
        index_path: Directory of the saved FAISS index. Defaults to the
            module-level ``FAISS_INDEX_PATH`` when ``None``.
        model_name: Embedding model name passed to ``HuggingFaceEmbeddings``.
            Defaults to the module-level ``EMBEDDINGS_MODEL_NAME`` when
            ``None``.

    Returns:
        bool: ``False`` if the index is missing or cannot be loaded, ``True``
        once the report has been printed.
    """
    if index_path is None:
        index_path = FAISS_INDEX_PATH

    print("EXPLORING METADATA IN FAISS DATABASE")
    print("=" * 60)

    if not os.path.exists(index_path):
        print(f"❌ Error: FAISS index not found at {index_path}")
        return False

    # Resolved only after the path check so a missing index fails fast.
    if model_name is None:
        model_name = EMBEDDINGS_MODEL_NAME

    try:
        embeddings = HuggingFaceEmbeddings(model_name=model_name)
        # SECURITY: allow_dangerous_deserialization=True makes load_local
        # unpickle data stored with the index. Only point this script at an
        # index you created yourself or otherwise trust.
        vector_db = FAISS.load_local(
            index_path, embeddings, allow_dangerous_deserialization=True
        )
        print(f"✅ Successfully loaded FAISS index from {index_path}")
    except Exception as e:
        print(f"❌ Error loading FAISS index: {e}")
        return False

    # Queries used to pull a sample of documents whose metadata is analyzed.
    sample_queries = [
        "Ethernet Interfaces Summary",
        "UNUSED",
        "interface configuration",
        "device information",
        "fabric",
    ]

    # Maps each metadata key to the distinct values observed for it.
    metadata_examples = {}

    print("\nSampling documents to analyze metadata...")
    print("-" * 40)

    for query in sample_queries:
        # Best effort: a failing query is reported and the rest still run.
        try:
            results = vector_db.similarity_search_with_score(query, k=3)
            _record_metadata(metadata_examples, (doc for doc, _ in results))
        except Exception as e:
            print(f"Error with query '{query}': {e}")

    metadata_keys = sorted(metadata_examples)

    # Display metadata analysis
    print("\n🔍 METADATA ANALYSIS")
    print("=" * 60)
    print(f"Total unique metadata keys found: {len(metadata_keys)}")
    print(f"Metadata keys: {metadata_keys}")

    print("\n📋 DETAILED METADATA FIELDS:")
    print("-" * 40)

    max_examples = 5  # cap on the example values printed per field
    for key in metadata_keys:
        examples = metadata_examples[key]
        print(f"\n🔑 Field: '{key}'")
        print(f"   Unique values found: {len(examples)}")
        print("   Example values:")
        for i, example in enumerate(examples[:max_examples], start=1):
            print(f"     {i}: {example!r}")
        if len(examples) > max_examples:
            print(f"     ... and {len(examples) - max_examples} more")

    # Show some detailed examples
    print("\n📄 SAMPLE DOCUMENTS WITH FULL METADATA:")
    print("-" * 40)

    # Same best-effort handling as the sampling loop: a failed search must
    # not abort the report after the analysis has already been printed.
    try:
        sample_results = vector_db.similarity_search_with_score("Ethernet", k=3)
    except Exception as e:
        print(f"Error with query 'Ethernet': {e}")
        sample_results = []

    _print_sample_documents(sample_results)

    # Analysis summary
    print("\n📊 SUMMARY:")
    print("=" * 60)

    device_names = metadata_examples.get('device_name', [])
    sources = metadata_examples.get('source', [])

    # Both figures count distinct values seen in the sample (non-empty
    # device names, and source entries), not the number of documents.
    device_docs = sum(1 for name in device_names if name)
    source_files = len(sources)

    print(f"• Device documents found: {device_docs}")
    print(f"• Source files found: {source_files}")

    if 'device_name' in metadata_examples:
        print(f"• Device names: {device_names}")

    if 'source' in metadata_examples:
        # Sorted so the output is stable from run to run.
        print(f"• Source file types: {sorted(_source_file_types(sources))}")

    return True

def main():
    """Run the metadata exploration and report the outcome.

    Returns:
        int: 0 when explore_metadata() returns a truthy value, 1 otherwise,
        so the result can be used directly as a process exit status.
    """
    if explore_metadata():
        print("\n✅ Metadata exploration completed successfully!")
        return 0

    print("\n❌ Metadata exploration failed")
    return 1

if __name__ == "__main__":
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module for interactive use and is not available when Python runs
    # with -S or in some frozen/embedded builds.
    raise SystemExit(main())