Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- .gitignore +2 -1
- README.md +4 -0
- app.py +37 -50
- faiss_index/index.faiss +3 -0
- faiss_index/index.pkl +3 -0
- loader.py +123 -0
- requirements.txt +5 -1
- test_loader.py +287 -0
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
vector_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -117,4 +117,5 @@ dmypy.json
|
|
117 |
|
118 |
# Pyre type checker
|
119 |
.pyre/
|
120 |
-
.venv
|
|
|
|
117 |
|
118 |
# Pyre type checker
|
119 |
.pyre/
|
120 |
+
.venv
|
121 |
+
documentation
|
README.md
CHANGED
@@ -138,3 +138,7 @@ Contributions are welcome! Please feel free to submit a Pull Request.
|
|
138 |
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
|
139 |
4. Push to the branch (`git push origin feature/amazing-feature`)
|
140 |
5. Open a Pull Request
|
|
|
|
|
|
|
|
|
|
138 |
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
|
139 |
4. Push to the branch (`git push origin feature/amazing-feature`)
|
140 |
5. Open a Pull Request
|
141 |
+
|
142 |
+
## Test Questions
|
143 |
+
|
144 |
+
1. What is the name server in this network? 8.8.8.8
|
app.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1 |
# /// script
|
2 |
# dependencies = [
|
3 |
# "PyYAML",
|
4 |
-
# "
|
5 |
-
# "
|
|
|
|
|
6 |
# "smolagents",
|
7 |
# "gradio",
|
8 |
# "einops",
|
9 |
# "smolagents[litellm]",
|
|
|
10 |
# ]
|
11 |
# ///
|
12 |
|
@@ -28,13 +31,11 @@ with open("prompts.yaml", 'r') as stream:
|
|
28 |
# trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))
|
29 |
# SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
|
30 |
|
31 |
-
import
|
32 |
-
from
|
33 |
|
34 |
-
|
35 |
-
EMBEDDING_MODEL_NAME = "
|
36 |
-
model_embeding = SentenceTransformer(EMBEDDING_MODEL_NAME, trust_remote_code=True)
|
37 |
-
client = chromadb.PersistentClient(path=db_name)
|
38 |
|
39 |
from smolagents import Tool
|
40 |
|
@@ -49,55 +50,41 @@ class RetrieverTool(Tool):
|
|
49 |
}
|
50 |
output_type = "string"
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
def forward(self, query: str) -> str:
|
57 |
assert isinstance(query, str), "Your search query must be a string"
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
print("Number of results:", len(result1['embeddings']))
|
62 |
-
query_vector = model_embeding.encode(query)
|
63 |
-
results = collection.query(
|
64 |
-
query_embeddings=[query_vector],
|
65 |
-
n_results=10,
|
66 |
-
include=["metadatas", "documents"]
|
67 |
-
)
|
68 |
response = ""
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
73 |
else:
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
77 |
return response
|
78 |
|
79 |
-
|
80 |
-
"""
|
81 |
-
This method return the name of the device if the data belongs to a device if not is global.
|
82 |
-
Args:
|
83 |
-
value: Source of the metadata.
|
84 |
-
Returns:
|
85 |
-
str: The name of the device.
|
86 |
-
"""
|
87 |
-
if not value:
|
88 |
-
return "global"
|
89 |
-
if "/devices/" not in value:
|
90 |
-
return "global"
|
91 |
-
parts = value.split("/devices/")
|
92 |
-
if len(parts) != 2:
|
93 |
-
return "global"
|
94 |
-
device_name = parts[1].replace(".md", "")
|
95 |
-
return device_name
|
96 |
-
|
97 |
-
import yaml
|
98 |
-
|
99 |
-
with open("prompts.yaml", 'r') as stream:
|
100 |
-
prompt_templates = yaml.safe_load(stream)
|
101 |
|
102 |
retriever_tool = RetrieverTool()
|
103 |
from smolagents import CodeAgent, HfApiModel, LiteLLMModel
|
|
|
1 |
# /// script
|
2 |
# dependencies = [
|
3 |
# "PyYAML",
|
4 |
+
# "langchain-community", # For FAISS, HuggingFaceEmbeddings
|
5 |
+
# "langchain", # Core Langchain
|
6 |
+
# "faiss-cpu", # FAISS vector store
|
7 |
+
# "sentence-transformers", # For HuggingFaceEmbeddings
|
8 |
# "smolagents",
|
9 |
# "gradio",
|
10 |
# "einops",
|
11 |
# "smolagents[litellm]",
|
12 |
+
# # "unstructured" # Required by loader.py, not directly by app.py but good for environment consistency
|
13 |
# ]
|
14 |
# ///
|
15 |
|
|
|
31 |
# trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint)))
|
32 |
# SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
|
33 |
|
34 |
+
from langchain_community.vectorstores import FAISS
|
35 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
36 |
|
37 |
+
FAISS_INDEX_PATH = "faiss_index"
|
38 |
+
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" # Must match loader.py
|
|
|
|
|
39 |
|
40 |
from smolagents import Tool
|
41 |
|
|
|
50 |
}
|
51 |
output_type = "string"
|
52 |
|
53 |
+
def __init__(self, **kwargs):
    """Initialise the tool and load the pre-built FAISS index from disk."""
    super().__init__(**kwargs)
    # The embedding model must be identical to the one used by loader.py
    # when the index was built, or query vectors will not match the index.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    self.embeddings = embeddings
    # Langchain pickles part of the saved index; opting in to
    # deserialization is required (and safe for an index we built ourselves).
    self.db = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
|
62 |
|
63 |
def forward(self, query: str) -> str:
    """Run a semantic search over the documentation index and format the hits."""
    assert isinstance(query, str), "Your search query must be a string"

    hits = self.db.similarity_search_with_score(query, k=10)
    if not hits:
        return "No relevant information found in the documentation for your query."

    parts = []
    for doc, score in hits:
        meta = doc.metadata
        origin = meta.get('source', 'Unknown source')
        device = meta.get('device_name')
        if device:
            header = f"Device: {device} (Source: {origin}, Score: {score:.4f})\n"
        else:
            # Chunks without a device name hold global / fabric-wide information.
            header = f"Global/Fabric Info (Source: {origin}, Score: {score:.4f})\n"
        parts.append(header + f"Result: {doc.page_content}\n\n")

    print(f"Retrieved {len(hits)} results for query: '{query}'")
    return "".join(parts)
|
86 |
|
87 |
+
# The 'device' method is removed as 'device_name' is now directly in metadata.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
retriever_tool = RetrieverTool()
|
90 |
from smolagents import CodeAgent, HfApiModel, LiteLLMModel
|
faiss_index/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:828be1f0d0f7a1249982a3858640ea1164e27a55a68ec7cece4a39ea502c375d
|
3 |
+
size 347181
|
faiss_index/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f95e3a8e6e86b6c1df0cc92b569424dbeda0061a2081e21f107156921c10898b
|
3 |
+
size 183933
|
loader.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Improved loader script for creating FAISS vector database from Markdown documentation.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
8 |
+
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
|
9 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
10 |
+
from langchain_community.vectorstores import FAISS
|
11 |
+
|
12 |
+
# Define the paths to your documentation folders
|
13 |
+
DOCS_DIR = "documentation"
|
14 |
+
DEVICE_DOCS_PATH = os.path.join(DOCS_DIR, "devices")
|
15 |
+
FABRIC_DOCS_PATH = os.path.join(DOCS_DIR, "fabric")
|
16 |
+
FAISS_INDEX_PATH = "faiss_index"
|
17 |
+
|
18 |
+
def load_markdown_documents(file_paths):
    """Load markdown files and tag each resulting document's metadata.

    Every document receives the originating filename under the 'source'
    key; documents coming from device files (basename containing 'DCX-')
    also receive a 'device_name' key derived from the filename.
    """
    documents = []
    for path in file_paths:
        filename = os.path.basename(path)
        for doc in UnstructuredMarkdownLoader(path).load():
            # Some loaders may leave metadata unset; normalise to a dict.
            if doc.metadata is None:
                doc.metadata = {}
            doc.metadata['source'] = filename
            if 'DCX-' in filename:
                doc.metadata['device_name'] = filename.replace('.md', '')
            documents.append(doc)
    return documents
|
40 |
+
|
41 |
+
def create_vector_db():
    """Scan the documentation folders, chunk the markdown files, embed the
    chunks and persist a FAISS vector database to FAISS_INDEX_PATH.

    Chunking happens in two passes: first by markdown headers (keeping a
    section's context together), then by a recursive character splitter
    (bounding chunk size for the embedding model).
    """
    # Collect every markdown file from both documentation trees.
    # (os.walk on a missing directory simply yields nothing.)
    markdown_files = []
    for docs_path in (DEVICE_DOCS_PATH, FABRIC_DOCS_PATH):
        for root, _, files in os.walk(docs_path):
            markdown_files.extend(
                os.path.join(root, f) for f in files if f.endswith(".md")
            )

    if not markdown_files:
        print("No markdown files found in the specified directories.")
        return

    print(f"Found {len(markdown_files)} markdown files to process.")

    documents = load_markdown_documents(markdown_files)
    print(f"Loaded {len(documents)} documents.")

    # Split on up to three heading levels so each chunk keeps its section.
    headers_to_split_on = [
        ("#", "header1"),
        ("##", "header2"),
        ("###", "header3"),
    ]
    header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    header_split_docs = []
    for doc in documents:
        try:
            header_split = header_splitter.split_text(doc.page_content)
            for split_doc in header_split:
                # Carry the source/device metadata onto each split chunk.
                split_doc.metadata.update(doc.metadata)
            header_split_docs.extend(header_split)
        except Exception as e:
            # Header splitting is best-effort; fall back to the whole doc.
            print(f"Warning: Could not split by headers: {e}")
            header_split_docs.append(doc)

    # Second pass: bound chunk size, with generous overlap for recall.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
    texts = text_splitter.split_documents(header_split_docs)
    print(f"Split documents into {len(texts)} chunks.")

    # Prepend the device name to device chunks so the embedding captures it.
    for text_chunk in texts:
        device_name = text_chunk.metadata.get('device_name')
        if device_name is None:
            continue
        prefix = f"Device: {device_name}"
        # BUG FIX: the original wrote "\\n\\n" here, embedding literal
        # backslash-n characters instead of blank lines.
        if not text_chunk.page_content.strip().startswith(prefix):
            text_chunk.page_content = f"{prefix}\n\n{text_chunk.page_content}"

    # Check for empty input BEFORE loading the embedding model: loading the
    # sentence-transformer is expensive and pointless with nothing to index.
    if not texts:
        print("No text chunks to process for FAISS index.")
        return

    print("Creating FAISS vector database...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("Embeddings model loaded.")

    print("Creating FAISS index...")
    vector_db = FAISS.from_documents(texts, embeddings)
    print("FAISS index created.")

    # Persist the index so app.py / test_loader.py can load it later.
    vector_db.save_local(FAISS_INDEX_PATH)
    print(f"FAISS index saved to {FAISS_INDEX_PATH}")
|
121 |
+
|
122 |
+
# Script entry point: build the FAISS index when executed directly.
if __name__ == "__main__":
    create_vector_db()
|
requirements.txt
CHANGED
@@ -4,4 +4,8 @@ sentence-transformers
|
|
4 |
smolagents
|
5 |
gradio
|
6 |
smolagents[litellm]
|
7 |
-
einops
|
|
|
|
|
|
|
|
|
|
4 |
smolagents
|
5 |
gradio
|
6 |
smolagents[litellm]
|
7 |
+
einops
|
8 |
+
langchain-community
|
9 |
+
langchain
|
10 |
+
faiss-cpu
|
11 |
+
unstructured
|
test_loader.py
ADDED
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for the FAISS vector database created by loader.py.
|
4 |
+
Allows interactive querying of the documentation and searching for specific strings in results.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
9 |
+
from langchain_community.vectorstores import FAISS
|
10 |
+
|
11 |
+
# Configuration
|
12 |
+
FAISS_INDEX_PATH = "faiss_index"
|
13 |
+
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
14 |
+
|
15 |
+
def load_vector_db():
    """Load the persisted FAISS index from disk.

    Returns the vector store, or None when the index is missing or fails
    to load (an explanatory message is printed in either case).
    """
    if not os.path.exists(FAISS_INDEX_PATH):
        print(f"Error: FAISS index not found at {FAISS_INDEX_PATH}")
        print("Please run loader.py first to create the vector database.")
        return None

    try:
        # Must be the same embedding model that built the index.
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)
        vector_db = FAISS.load_local(
            FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
        )
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        return None

    print(f"Successfully loaded FAISS index from {FAISS_INDEX_PATH}")
    return vector_db
|
36 |
+
|
37 |
+
def search_documents(vector_db, query, k=3):
    """Return up to *k* (document, score) pairs most similar to *query*.

    Args:
        vector_db: The loaded FAISS vector store.
        query: The search query string.
        k: Number of top results to return.

    Returns:
        A list of (document, score) tuples; empty on any search error
        (the error is printed rather than propagated).
    """
    try:
        return vector_db.similarity_search_with_score(query, k=k)
    except Exception as e:
        print(f"Error during search: {e}")
        return []
|
56 |
+
|
57 |
+
def find_string_in_results(docs_with_scores, search_string):
    """Case-insensitively locate *search_string* inside each result document.

    Args:
        docs_with_scores: (document, score) tuples from a similarity search.
        search_string: Substring to look for in each document's content.

    Returns:
        One match record per occurrence (overlapping occurrences included),
        each carrying the 1-based result index, source, score, the match
        position, and ~100 characters of surrounding context.
    """
    needle = search_string.lower()
    matches = []
    for idx, (doc, score) in enumerate(docs_with_scores, start=1):
        haystack = doc.page_content.lower()
        pos = haystack.find(needle)
        while pos != -1:
            # Context window: up to 100 chars before and after the hit.
            lo = max(0, pos - 100)
            hi = min(len(doc.page_content), pos + len(search_string) + 100)
            matches.append({
                'result_index': idx,
                'source': doc.metadata.get('source', 'Unknown'),
                'similarity_score': score,
                'context': doc.page_content[lo:hi],
                'position': pos,
            })
            # Advance by one (not by the needle length) so that
            # overlapping occurrences are also reported.
            pos = haystack.find(needle, pos + 1)
    return matches
|
98 |
+
|
99 |
+
def print_search_results(docs_with_scores):
    """Pretty-print similarity-search results to stdout."""
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"SEARCH RESULTS ({len(docs_with_scores)} results)")
    print(f"{banner}")

    for i, (doc, score) in enumerate(docs_with_scores, 1):
        print(f"\n--- Result {i} (Similarity Score: {score:.4f}) ---")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        # Only a preview: full chunks can be hundreds of characters long.
        print(f"Content Preview: {doc.page_content[:200]}...")
        print("-" * 50)
|
112 |
+
|
113 |
+
def print_string_matches(matches, search_string):
    """Pretty-print string-search matches to stdout.

    NOTE(review): the original file's status markers were mojibake
    (corrupted multi-byte characters rendered as 'β'); they are restored
    here as the evidently intended ❌ / ✅ emoji — confirm against the
    pre-corruption source if available.
    """
    if not matches:
        print(f"\n❌ No matches found for '{search_string}' in the search results.")
        return

    print(f"\n{'='*60}")
    print(f"STRING SEARCH RESULTS for '{search_string}' ({len(matches)} matches)")
    print(f"{'='*60}")

    for match in matches:
        print(f"\n✅ Match found in Result #{match['result_index']}")
        print(f"Source: {match['source']}")
        print(f"Similarity Score: {match['similarity_score']:.4f}")
        print(f"Context: ...{match['context']}...")
        print("-" * 50)
|
131 |
+
|
132 |
+
# Test cases configuration: each entry pairs a natural-language question
# with a literal string expected to appear somewhere in the retrieved chunks.
TEST_CASES = [
    {"question": "What is the management IP address of DCX-L2LEAF1A?",
     "expected_string": "172.20.20.57"},
    {"question": "What VLANs are on DCX-L2LEAF1A?",
     "expected_string": "VRF10_VLAN11"},
    {"question": "What spanning tree mode is configured?",
     "expected_string": "mstp"},
    {"question": "What is the NTP server configured?",
     "expected_string": "0.pool.ntp.org"},
    {"question": "What VRF is used for management?",
     "expected_string": "MGMT"},
    {"question": "What is the default gateway for management?",
     "expected_string": "172.20.20.1"},
    {"question": "What ethernet interfaces are on DCX-L2LEAF1A?",
     "expected_string": "Ethernet1"},
    {"question": "What port-channel interfaces exist?",
     "expected_string": "Port-Channel1"},
    {"question": "What is the TerminAttr daemon configuration?",
     "expected_string": "apiserver.arista.io"},
    {"question": "What local users are configured?",
     "expected_string": "admin"},
    {"question": "What's the description of Ethernet5 on DCX-L2LEAF1A?",
     "expected_string": "DCX-leaf1-server1_iLO"},
    {"question": "What channel group is configured on DCX-L2LEAF1A Ethernet1?",
     "expected_string": "channel-group 1"},
    {"question": "What VLAN access mode is on DCX-L2LEAF1A Ethernet5?",
     "expected_string": "access vlan 11"},
    {"question": "What is the DNS server configured?",
     "expected_string": "8.8.8.8"},
    {"question": "What protocol is used for management API on DCX-L2LEAF1A?",
     "expected_string": "protocol https"},
]
|
195 |
+
|
196 |
+
def run_automated_tests(vector_db):
    """Run every entry of TEST_CASES against the vector database.

    For each case: semantic-search the question, then look for the
    expected literal string in the returned chunks. Prints a per-test
    verdict and a final summary.

    Returns:
        True when all tests pass, False otherwise.

    NOTE(review): the original file's status markers were mojibake;
    they are restored here as the evidently intended emoji.
    """
    print("\n🧪 Running Automated FAISS Database Tests")
    print("=" * 60)

    total_tests = len(TEST_CASES)
    passed_tests = 0
    failed_tests = 0

    for i, test_case in enumerate(TEST_CASES, 1):
        question = test_case["question"]
        expected_string = test_case["expected_string"]

        print(f"\n🔍 Test {i}/{total_tests}: {question}")
        print(f"Expected to find: '{expected_string}'")
        print("-" * 50)

        try:
            # Wider k than the default to give string matching more chances.
            docs_with_scores = search_documents(vector_db, question, k=10)

            if not docs_with_scores:
                print("❌ FAIL: No search results found")
                failed_tests += 1
                continue

            matches = find_string_in_results(docs_with_scores, expected_string)

            if matches:
                print(f"✅ PASS: Found '{expected_string}' in search results")
                print(f"   Found in: {matches[0]['source']}")
                print(f"   Similarity Score: {matches[0]['similarity_score']:.4f}")
                print(f"   Context: ...{matches[0]['context'][:100]}...")
                passed_tests += 1
            else:
                print(f"❌ FAIL: '{expected_string}' not found in search results")
                print("   Search results sources (top 5):")
                for doc, score in docs_with_scores[:5]:
                    print(f"   - {doc.metadata.get('source', 'Unknown')} (score: {score:.4f})")
                # Debug aid: docs_with_scores is non-empty here (guarded by
                # the `continue` above), so the redundant re-check the
                # original performed has been dropped.
                top_doc = docs_with_scores[0][0]
                print(f"   Top result content preview: {top_doc.page_content[:200]}...")
                failed_tests += 1

        except Exception as e:
            print(f"❌ ERROR: {e}")
            failed_tests += 1

    # Summary.
    print("\n" + "=" * 60)
    print("📊 TEST SUMMARY")
    print("=" * 60)
    print(f"Total Tests: {total_tests}")
    print(f"✅ Passed: {passed_tests}")
    print(f"❌ Failed: {failed_tests}")
    print(f"Success Rate: {(passed_tests / total_tests) * 100:.1f}%")

    if failed_tests > 0:
        print(f"\n⚠️ {failed_tests} test(s) failed. Check the results above.")
        return False
    # Plain string: the original used an f-string with no placeholders.
    print("\n🎉 All tests passed!")
    return True
|
264 |
+
|
265 |
+
def main():
    """Load the FAISS database, run the automated tests, and exit.

    Exit status is 0 on success and 1 on any test failure, so the script
    can gate CI pipelines. Raises SystemExit directly instead of calling
    the interactive `exit()` helper, which is injected by the `site`
    module and not guaranteed to exist in every runtime.
    """
    print("🔄 Loading FAISS Vector Database...")

    vector_db = load_vector_db()
    if vector_db is None:
        # load_vector_db already printed the reason; nothing more to do.
        return

    if run_automated_tests(vector_db):
        print("\n✅ All tests completed successfully!")
        raise SystemExit(0)
    raise SystemExit(1)
|
285 |
+
|
286 |
+
# Allow running the automated test suite directly from the command line.
if __name__ == "__main__":
    main()
|