# /// script
# dependencies = [
# "langchain_community",
# "langchain_core",
# ]
# ///
"""
Enhanced loader script for creating FAISS vector database from Markdown documentation
with improved header metadata extraction.
"""
import os
import re
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
# Root folder that holds all markdown documentation.
DOCS_DIR = "documentation"
# Per-device documentation (device files are detected via a 'DCX-' filename prefix below).
DEVICE_DOCS_PATH = os.path.join(DOCS_DIR, "devices")
# Fabric-wide documentation.
FABRIC_DOCS_PATH = os.path.join(DOCS_DIR, "fabric")
# Directory the built FAISS index is persisted to.
FAISS_INDEX_PATH = "faiss_index"
def extract_header_context(content, chunk_start_pos):
    """
    Return the markdown header hierarchy in effect at ``chunk_start_pos``.

    Scans every line of ``content`` before the given character offset and
    keeps the most recent header seen at each level 1-5. Seeing a header at
    level N discards any previously recorded headers deeper than N, so the
    result is the "path" of headers enclosing that position.

    Args:
        content: Full markdown source text.
        chunk_start_pos: Character offset into ``content``.

    Returns:
        Dict mapping 'header1'..'header5' to header text (levels not present
        are simply absent).
    """
    active = {}
    for raw in content[:chunk_start_pos].split('\n'):
        stripped = raw.strip()
        # Only '#'-prefixed lines count; '#!' is excluded so a shebang-like
        # line is never mistaken for a header.
        if not stripped.startswith('#') or stripped.startswith('#!'):
            continue
        depth = 0
        while depth < len(stripped) and stripped[depth] == '#':
            depth += 1
        if not 1 <= depth <= 5:
            continue
        active[f'header{depth}'] = stripped[depth:].strip()
        # Entering a new section invalidates any deeper headers recorded so far.
        for deeper in range(depth + 1, 6):
            active.pop(f'header{deeper}', None)
    return active
def enhance_chunk_metadata(chunk, original_content, chunk_position, file_metadata):
    """
    Build the metadata dict for a single chunk.

    Combines the file-level metadata with the markdown header hierarchy in
    effect at the chunk's position, and derives two convenience fields from
    that hierarchy.

    Args:
        chunk: The chunk text (not inspected here; kept for interface
            stability with callers).
        original_content: Full markdown source the chunk was cut from.
        chunk_position: Character offset of the chunk within ``original_content``.
        file_metadata: Base metadata shared by every chunk of this file.

    Returns:
        A new dict containing ``file_metadata`` entries, any 'headerN'
        entries, plus — when at least one header precedes the chunk —
        'header_path' (levels joined with " > ") and 'section_title'
        (the deepest, most specific header).
    """
    merged = dict(file_metadata)
    merged.update(extract_header_context(original_content, chunk_position))
    # Collect whichever header levels are present, shallowest first.
    trail = [merged[f'header{lvl}'] for lvl in range(1, 6) if f'header{lvl}' in merged]
    if trail:
        merged['header_path'] = " > ".join(trail)
        merged['section_title'] = trail[-1]
    return merged
def load_markdown_documents_with_headers(file_paths):
    """
    Load markdown files and split them into Documents enriched with header metadata.

    For each file the raw text is chunked with a RecursiveCharacterTextSplitter,
    each chunk is located in the original text so its enclosing header
    hierarchy can be recorded, and device files additionally get a
    "Device: <name>" prefix in the chunk content.

    Args:
        file_paths: Iterable of paths to markdown files.

    Returns:
        List of langchain ``Document`` objects with enhanced metadata.
    """
    all_documents = []
    # The splitter configuration is loop-invariant — build it once, not per file.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=200,
        separators=["\n## ", "\n### ", "\n#### ", "\n##### ", "\n\n", "\n", " ", ""]
    )
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        print(f"Processing: {file_name}")
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        file_metadata = {'source': file_name}
        # Device docs are identified by a 'DCX-' prefix in the filename.
        if 'DCX-' in file_name:
            file_metadata['device_name'] = file_name.replace('.md', '')
        chunks = text_splitter.split_text(content)
        # Track a forward search offset so repeated chunk text maps to
        # successive positions instead of always the first occurrence.
        # (Chunks overlap, so the next chunk may start before the previous
        # one ends — we only advance to the current chunk's start.)
        search_start = 0
        for chunk in chunks:
            chunk_position = content.find(chunk, search_start)
            if chunk_position == -1:
                chunk_position = content.find(chunk)
            if chunk_position == -1:
                # Exact text not found (splitter may normalise whitespace);
                # fall back to a short prefix, then to the file start.
                prefix = chunk[:min(100, len(chunk))]
                chunk_position = content.find(prefix)
                if chunk_position == -1:
                    chunk_position = 0
            else:
                search_start = chunk_position
            enhanced_metadata = enhance_chunk_metadata(chunk, content, chunk_position, file_metadata)
            final_content = chunk
            if 'device_name' in enhanced_metadata:
                device_name = enhanced_metadata['device_name']
                if not chunk.strip().startswith(f"Device: {device_name}"):
                    # Fixed: the original wrote "\\n\\n", embedding literal
                    # backslash-n characters instead of a real blank line.
                    final_content = f"Device: {device_name}\n\n{chunk}"
            all_documents.append(Document(page_content=final_content, metadata=enhanced_metadata))
    return all_documents
def create_vector_db():
    """
    Scan the documentation folders, load markdown files with enhanced header
    metadata, create embeddings, and save a FAISS vector database.

    Side effects:
        Writes the index to FAISS_INDEX_PATH and prints progress to stdout.
        Returns early (None) when no markdown files or chunks are found.
    """
    # Both documentation trees are collected with identical walk logic.
    markdown_files = [
        os.path.join(root, name)
        for docs_root in (DEVICE_DOCS_PATH, FABRIC_DOCS_PATH)
        for root, _, names in os.walk(docs_root)
        for name in names
        if name.endswith(".md")
    ]
    if not markdown_files:
        print("No markdown files found in the specified directories.")
        return
    print(f"Found {len(markdown_files)} markdown files to process.")
    documents = load_markdown_documents_with_headers(markdown_files)
    print(f"Created {len(documents)} document chunks with header metadata.")
    # Moved this guard ahead of the embedding-model load so we don't pay the
    # (slow) model download/initialisation when there is nothing to index.
    if not documents:
        print("No documents to process for FAISS index.")
        return
    # Debug aid: show the metadata produced for the first few chunks.
    # Fixed: the original printed literal "\n" sequences ("\\n" in source)
    # instead of real newlines.
    print("\nSample metadata from first 3 chunks:")
    for i, doc in enumerate(documents[:3]):
        print(f"\nChunk {i+1}:")
        print(f" Source: {doc.metadata.get('source', 'Unknown')}")
        print(f" Device: {doc.metadata.get('device_name', 'N/A')}")
        print(f" Header Path: {doc.metadata.get('header_path', 'No headers')}")
        print(f" Section Title: {doc.metadata.get('section_title', 'No section')}")
        print(f" Content Preview: {doc.page_content[:100]}...")
    print("\nCreating FAISS vector database...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("Embeddings model loaded.")
    print("Creating FAISS index...")
    vector_db = FAISS.from_documents(documents, embeddings)
    print("FAISS index created.")
    vector_db.save_local(FAISS_INDEX_PATH)
    print(f"FAISS index saved to {FAISS_INDEX_PATH}")
    print(f"Total chunks in database: {len(documents)}")
# Build (or rebuild) the FAISS index when executed as a script.
if __name__ == "__main__":
    create_vector_db()