File size: 7,260 Bytes
890d952
 
 
 
 
 
eb4910e
890d952
 
eb4910e
 
 
890d952
eb4910e
 
 
 
890d952
eb4910e
 
 
 
 
 
890d952
eb4910e
890d952
 
eb4910e
890d952
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb4910e
890d952
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb4910e
 
 
890d952
 
eb4910e
 
890d952
 
eb4910e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
890d952
 
 
eb4910e
890d952
 
 
 
 
 
 
 
 
eb4910e
890d952
eb4910e
 
 
 
890d952
 
eb4910e
 
 
890d952
eb4910e
 
 
 
 
890d952
eb4910e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# /// script
# dependencies = [
#     "langchain",
#     "langchain_community",
#     "langchain_core",
#     "sentence-transformers",
#     "faiss-cpu",
# ]
# ///
"""
Enhanced loader script for creating FAISS vector database from Markdown documentation
with improved header metadata extraction.
"""

import os
import re
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

DOCS_DIR = "documentation"
DEVICE_DOCS_PATH = os.path.join(DOCS_DIR, "devices")
FABRIC_DOCS_PATH = os.path.join(DOCS_DIR, "fabric")
FAISS_INDEX_PATH = "faiss_index"

def extract_header_context(content, chunk_start_pos):
    """
    Extract the markdown header hierarchy in effect at a given position.

    Scans ``content`` up to ``chunk_start_pos`` and tracks the most recent
    header seen at each level, discarding deeper levels whenever a
    shallower header appears.

    Args:
        content: Full markdown text of the source file.
        chunk_start_pos: Character offset of the chunk within ``content``.

    Returns:
        dict: Keys ``header1``..``header5`` mapped to header text for every
        level still active at the position (empty if no headers precede it).
    """
    lines = content[:chunk_start_pos].split('\n')
    headers = {}
    in_code_fence = False

    # Track the current header hierarchy
    for line in lines:
        line = line.strip()
        # BUG FIX: '#' lines inside fenced code blocks (e.g. shell comments
        # in ``` examples) were wrongly treated as markdown headers.
        if line.startswith('```') or line.startswith('~~~'):
            in_code_fence = not in_code_fence
            continue
        if in_code_fence:
            continue
        if line.startswith('#') and not line.startswith('#!'):  # Exclude shebang
            # Count the number of # to determine header level
            level = len(line) - len(line.lstrip('#'))
            if 1 <= level <= 5:  # Only process header levels 1-5
                headers[f'header{level}'] = line.lstrip('#').strip()
                # Clear deeper headers: a new section invalidates its subsections
                for i in range(level + 1, 6):
                    headers.pop(f'header{i}', None)

    return headers

def enhance_chunk_metadata(chunk, original_content, chunk_position, file_metadata):
    """
    Build the metadata dict for a single chunk.

    Combines the file-level metadata with the markdown header hierarchy in
    effect at ``chunk_position``, then derives ``header_path`` (a
    " > "-joined breadcrumb) and ``section_title`` (the deepest header)
    when any headers apply.

    Args:
        chunk: The chunk text (unused here; kept for interface stability).
        original_content: Full markdown text the chunk came from.
        chunk_position: Character offset of the chunk in ``original_content``.
        file_metadata: Base metadata shared by all chunks of the file.

    Returns:
        dict: A new metadata dict; ``file_metadata`` is not mutated.
    """
    metadata = dict(file_metadata)
    metadata.update(extract_header_context(original_content, chunk_position))

    # Assemble the breadcrumb from whichever header levels are present.
    breadcrumb = [
        metadata[f'header{level}']
        for level in range(1, 6)
        if f'header{level}' in metadata
    ]

    if breadcrumb:
        metadata['header_path'] = " > ".join(breadcrumb)
        metadata['section_title'] = breadcrumb[-1]  # deepest / most specific

    return metadata

def load_markdown_documents_with_headers(file_paths):
    """
    Load markdown files and split them into chunks with header metadata.

    Args:
        file_paths: Iterable of paths to ``.md`` files.

    Returns:
        list[Document]: One Document per chunk. Metadata carries ``source``
        (basename), optional ``device_name`` for DCX-* device files, the
        active header hierarchy (``header1``..``header5``), and derived
        ``header_path`` / ``section_title`` when headers precede the chunk.
    """
    all_documents = []

    for file_path in file_paths:
        base_name = os.path.basename(file_path)
        print(f"Processing: {base_name}")

        # Read the raw markdown content
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Create base metadata for this file
        file_metadata = {'source': base_name}

        # Device docs are named after the device (e.g. DCX-1234.md)
        if 'DCX-' in base_name:
            file_metadata['device_name'] = base_name.replace('.md', '')

        # Split on header boundaries first so chunks align with sections
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=200,
            separators=["\n## ", "\n### ", "\n#### ", "\n##### ", "\n\n", "\n", " ", ""]
        )

        chunks = text_splitter.split_text(content)

        for chunk in chunks:
            # Locate the chunk in the original content so the headers that
            # precede it can be recovered; fall back to a 100-char prefix
            # search, then to the start of the file.
            chunk_position = content.find(chunk)
            if chunk_position == -1:
                chunk_position = content.find(chunk[:min(100, len(chunk))])
                if chunk_position == -1:
                    chunk_position = 0  # Fallback to beginning

            # Enhance metadata with header context
            enhanced_metadata = enhance_chunk_metadata(chunk, content, chunk_position, file_metadata)

            # Prepend a device banner to the content of device-file chunks
            final_content = chunk
            if 'device_name' in enhanced_metadata:
                device_name = enhanced_metadata['device_name']
                if not chunk.strip().startswith(f"Device: {device_name}"):
                    # BUG FIX: was "\\n\\n", which embedded literal
                    # backslash-n text instead of real blank-line newlines.
                    final_content = f"Device: {device_name}\n\n{chunk}"

            # Create document with enhanced metadata
            doc = Document(page_content=final_content, metadata=enhanced_metadata)
            all_documents.append(doc)

    return all_documents

def create_vector_db():
    """
    Scan documentation folders, load MD files with enhanced header metadata,
    create embeddings, and save a FAISS vector database.

    Walks DEVICE_DOCS_PATH and FABRIC_DOCS_PATH for ``.md`` files; returns
    early (with a message) when no files or no chunks are found. The index
    is written to FAISS_INDEX_PATH.
    """
    markdown_files = []

    # Collect all markdown files from both documentation trees
    for docs_path in (DEVICE_DOCS_PATH, FABRIC_DOCS_PATH):
        for root, _, files in os.walk(docs_path):
            for file in files:
                if file.endswith(".md"):
                    markdown_files.append(os.path.join(root, file))

    if not markdown_files:
        print("No markdown files found in the specified directories.")
        return

    print(f"Found {len(markdown_files)} markdown files to process.")

    # Load documents with enhanced header metadata
    documents = load_markdown_documents_with_headers(markdown_files)
    print(f"Created {len(documents)} document chunks with header metadata.")

    # Guard before the expensive embeddings load, not after
    if not documents:
        print("No documents to process for FAISS index.")
        return

    # Debug: Print sample metadata from first few chunks.
    # BUG FIX: these strings used "\\n", printing literal backslash-n
    # text instead of starting new lines.
    print("\nSample metadata from first 3 chunks:")
    for i, doc in enumerate(documents[:3]):
        print(f"\nChunk {i+1}:")
        print(f"  Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"  Device: {doc.metadata.get('device_name', 'N/A')}")
        print(f"  Header Path: {doc.metadata.get('header_path', 'No headers')}")
        print(f"  Section Title: {doc.metadata.get('section_title', 'No section')}")
        print(f"  Content Preview: {doc.page_content[:100]}...")

    print("\nCreating FAISS vector database...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    print("Embeddings model loaded.")

    print("Creating FAISS index...")
    vector_db = FAISS.from_documents(documents, embeddings)
    print("FAISS index created.")

    # Save FAISS index
    vector_db.save_local(FAISS_INDEX_PATH)
    print(f"FAISS index saved to {FAISS_INDEX_PATH}")
    print(f"Total chunks in database: {len(documents)}")

if __name__ == "__main__":
    # Build (or rebuild) the FAISS index when run directly as a script.
    create_vector_db()