"""Data persistence for document system."""

import json
import os
from typing import Dict, Optional, Tuple
import numpy as np


def save_document_system(document_index: Dict, tag_embeddings: Dict,
                         doc_tag_mapping: Dict, chunk_embeddings: Optional[Dict] = None,
                         output_dir: Optional[str] = None):
    """Save the complete document indexing system.
    
    Args:
        document_index: Document index dictionary.
        tag_embeddings: Tag embeddings dictionary.
        doc_tag_mapping: Document-tag mapping dictionary.
        chunk_embeddings: Chunk embeddings dictionary (optional).
        output_dir: Output directory for saved files.
    """
    
    if output_dir is None:
        # Get project root directory
        from pathlib import Path
        root_dir = Path(__file__).parent.parent.parent.parent
        output_dir = root_dir / 'embeddings' / 'pdfembeddings'
    
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Save document index (content + metadata + chunks)
    doc_index_serializable = {}
    for doc_name, doc_info in document_index.items():
        doc_index_serializable[doc_name] = {
            'full_content': doc_info.get('full_content', doc_info.get('content', '')),
            'chunks': doc_info.get('chunks', []),
            'symptoms': doc_info['symptoms'],
            'diagnoses': doc_info['diagnoses'],
            'treatments': doc_info.get('treatments', []),
            'all_tags': doc_info['all_tags']
        }
    
    with open(os.path.join(output_dir, 'document_index.json'), 'w', encoding='utf-8') as f:
        json.dump(doc_index_serializable, f, indent=2, ensure_ascii=False)
    
    # Save tag embeddings
    tag_embeddings_serializable = {
        tag: embedding.tolist() for tag, embedding in tag_embeddings.items()
    }
    with open(os.path.join(output_dir, 'tag_embeddings.json'), 'w', encoding='utf-8') as f:
        json.dump(tag_embeddings_serializable, f, indent=2, ensure_ascii=False)
    
    # Save document-tag mapping
    doc_tag_serializable = {}
    for doc_name, doc_info in doc_tag_mapping.items():
        doc_tag_serializable[doc_name] = {
            'tags': doc_info['tags'],
            'symptoms': doc_info['symptoms'],
            'diagnoses': doc_info['diagnoses'],
            'treatments': doc_info['treatments'],
            'tag_embeddings': {
                tag: embedding.tolist() 
                for tag, embedding in doc_info['tag_embeddings'].items()
            }
        }
    
    with open(os.path.join(output_dir, 'document_tag_mapping.json'), 'w', encoding='utf-8') as f:
        json.dump(doc_tag_serializable, f, indent=2, ensure_ascii=False)
    
    # Save chunk embeddings if provided
    if chunk_embeddings:
        chunk_embeddings_serializable = {}
        for doc_name, chunks in chunk_embeddings.items():
            chunk_embeddings_serializable[doc_name] = []
            for chunk in chunks:
                chunk_embeddings_serializable[doc_name].append({
                    'chunk_id': chunk['chunk_id'],
                    'text': chunk['text'],
                    'start_char': chunk.get('start_char', 0),
                    'end_char': chunk.get('end_char', len(chunk['text'])),
                    'token_count': chunk.get('token_count', len(chunk['text'].split())),
                    'embedding': chunk['embedding'].tolist()
                })
        
        with open(os.path.join(output_dir, 'chunk_embeddings.json'), 'w', encoding='utf-8') as f:
            json.dump(chunk_embeddings_serializable, f, indent=2, ensure_ascii=False)
    
    print("βœ… Document system saved to files")

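# Note added for illustration (an assumption inferred from the serialization code
# above, not stated in the original source): when chunk_embeddings is provided,
# each value is expected to be a list of dicts whose 'embedding' is a NumPy array,
# roughly:
#   {"doc_name": [{"chunk_id": 0, "text": "...", "embedding": np.ndarray}, ...]}
# since save_document_system calls chunk['embedding'].tolist() on every entry.
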

def load_document_system(input_dir: Optional[str] = None) -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
    """Load the complete document indexing system.
    
    Args:
        input_dir: Input directory containing saved files.
        
    Returns:
        Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings).
        Returns (None, None, None, None) if loading fails.
    """
    if input_dir is None:
        # Get project root directory
        from pathlib import Path
        root_dir = Path(__file__).parent.parent.parent.parent
        input_dir = root_dir / 'embeddings' / 'pdfembeddings'
    
    try:
        # Load document index
        with open(os.path.join(input_dir, 'document_index.json'), 'r', encoding='utf-8') as f:
            document_index = json.load(f)
        
        # Load tag embeddings
        with open(os.path.join(input_dir, 'tag_embeddings.json'), 'r', encoding='utf-8') as f:
            tag_embeddings_data = json.load(f)
            tag_embeddings = {
                tag: np.array(embedding) 
                for tag, embedding in tag_embeddings_data.items()
            }
        
        # Load document-tag mapping
        with open(os.path.join(input_dir, 'document_tag_mapping.json'), 'r', encoding='utf-8') as f:
            doc_tag_data = json.load(f)
            doc_tag_mapping = {}
            for doc_name, doc_info in doc_tag_data.items():
                doc_tag_mapping[doc_name] = {
                    'tags': doc_info['tags'],
                    'symptoms': doc_info['symptoms'],
                    'diagnoses': doc_info['diagnoses'],
                    'treatments': doc_info['treatments'],
                    'tag_embeddings': {
                        tag: np.array(embedding)
                        for tag, embedding in doc_info['tag_embeddings'].items()
                    }
                }
        
        # Try to load chunk embeddings if they exist
        chunk_embeddings = None
        chunk_embeddings_path = os.path.join(input_dir, 'chunk_embeddings.json')
        if os.path.exists(chunk_embeddings_path):
            with open(chunk_embeddings_path, 'r', encoding='utf-8') as f:
                chunk_data = json.load(f)
                chunk_embeddings = {}
                for doc_name, chunks in chunk_data.items():
                    chunk_embeddings[doc_name] = []
                    for chunk in chunks:
                        chunk_embeddings[doc_name].append({
                            'chunk_id': chunk['chunk_id'],
                            'text': chunk['text'],
                            'start_char': chunk.get('start_char', 0),
                            'end_char': chunk.get('end_char', len(chunk['text'])),
                            'token_count': chunk.get('token_count', len(chunk['text'].split())),
                            # Backward compatibility for old format
                            'start_word': chunk.get('start_word', 0),
                            'end_word': chunk.get('end_word', len(chunk['text'].split())),
                            'embedding': np.array(chunk['embedding'])
                        })
            print("βœ… Chunk embeddings loaded")
        
        print("βœ… Document system loaded successfully")
        return document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings
        
    except Exception as e:
        print(f"❌ Failed to load document system: {e}")
        return None, None, None, None
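

if __name__ == "__main__":
    # Minimal round-trip sketch added for illustration; it is not part of the
    # original module. The document name, tags, and the tiny 8-dimensional
    # random embeddings below are made-up assumptions purely to exercise
    # save_document_system() and load_document_system().
    example_tags = ["fever", "cough", "influenza"]
    example_tag_embeddings = {tag: np.random.rand(8) for tag in example_tags}

    example_index = {
        "case_001": {
            "full_content": "Patient presents with fever and cough.",
            "chunks": ["Patient presents with fever and cough."],
            "symptoms": ["fever", "cough"],
            "diagnoses": ["influenza"],
            "treatments": ["rest", "fluids"],
            "all_tags": example_tags,
        }
    }
    example_mapping = {
        "case_001": {
            "tags": example_tags,
            "symptoms": ["fever", "cough"],
            "diagnoses": ["influenza"],
            "treatments": ["rest", "fluids"],
            "tag_embeddings": example_tag_embeddings,
        }
    }

    # Save to a throwaway directory, then load everything back and confirm the round trip.
    save_document_system(example_index, example_tag_embeddings, example_mapping,
                         output_dir="./tmp_doc_system")
    loaded_index, loaded_tags, loaded_mapping, _ = load_document_system("./tmp_doc_system")
    print("Round trip OK:", loaded_index is not None and "case_001" in loaded_index)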