Testys committed on
Commit
6ecbc91
·
1 Parent(s): 748bb2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -37
app.py CHANGED
@@ -1,5 +1,70 @@
1
  import streamlit as st
2
  from search_utils import SemanticSearch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def main():
5
  st.set_page_config(
@@ -8,64 +73,162 @@ def main():
8
  layout="wide"
9
  )
10
 
11
- # Initialize search system first
12
- @st.cache_resource
13
  def init_search_system():
14
- system = SemanticSearch()
15
- system.initialize_system()
16
- return system
 
 
 
 
 
 
17
 
18
- # Custom CSS moved outside cached function
19
  st.markdown("""
20
  <style>
21
  div[data-testid="stExpander"] div[role="button"] p {
22
  font-size: 1.2rem;
23
  font-weight: bold;
 
24
  }
25
  a.source-link {
26
- color: #1e88e5 !important;
27
  text-decoration: none !important;
28
- border-bottom: 1px dotted #1e88e5;
 
29
  }
30
  a.source-link:hover {
31
- opacity: 0.8;
32
- border-bottom-style: solid;
 
 
 
 
 
 
 
33
  }
34
  </style>
35
  """, unsafe_allow_html=True)
36
 
37
- search_system = init_search_system()
 
 
 
 
38
 
39
  # Main UI components
40
  st.title("πŸ” Semantic Search Engine")
41
- query = st.text_input("Enter your search query:", placeholder="Search documents...")
42
-
 
 
 
 
43
  if query:
44
- with st.spinner("πŸ” Searching through documents..."):
45
- results = search_system.search(query, 5)
46
-
47
- if not results.empty:
48
- st.subheader("Top Results")
49
- for _, row in results.iterrows():
50
- with st.expander(f"{row['title']} (Similarity: {row['similarity']:.1%})"):
51
- st.markdown(f"**Summary**: {row['summary']}")
52
- st.markdown(f"<a class='source-link' href='{row['source']}' target='_blank'>View Source</a>",
53
- unsafe_allow_html=True)
54
- else:
55
- st.warning("No matching documents found")
56
-
57
- # System status sidebar
58
- with st.sidebar:
59
- st.subheader("System Status")
60
- st.metric("Total Documents", f"{search_system.metadata_mgr.total_docs:,}")
61
- st.metric("FAISS Shards", len(search_system.index_shards))
62
- st.metric("Metadata Shards", len(search_system.metadata_mgr.shard_map))
63
-
64
- # Sidebar controls outside main query block
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  with st.sidebar:
66
- if st.button("Clear Cache"):
67
- st.cache_resource.clear()
68
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  if __name__ == "__main__":
71
  main()
 
1
  import streamlit as st
2
  from search_utils import SemanticSearch
3
+ import logging
4
+ import time
5
+ import os
6
+ import sys
7
+ import psutil # Added missing import
8
+ from urllib.parse import urlparse
9
+
10
# Logging setup: send INFO-and-above records to a stream handler using a
# "timestamp - logger name - level - message" layout.
_LOGGING_OPTIONS = {
    "level": logging.INFO,
    "format": '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    "handlers": [logging.StreamHandler()],
}
logging.basicConfig(**_LOGGING_OPTIONS)

# Application-wide logger used by the rest of this module.
logger = logging.getLogger("SemanticSearchApp")
19
+
20
+ # Security validation functions
21
def is_valid_url(url):
    """Return True when *url* is an absolute http(s) URL that is safe to link.

    The result gates whether the value is rendered as a clickable link, so the
    check is deliberately strict:

    * the value must be a ``str`` (``None``, NaN and other non-string cells
      are rejected instead of relying on a catch-all ``except``);
    * it must not contain whitespace, control characters, quotes, angle
      brackets or backticks, because the caller interpolates it into a quoted
      HTML ``href`` attribute and such characters could break out of it;
    * it must parse with both a scheme and a network location;
    * the scheme must be ``http`` or ``https`` (rejects ``javascript:``,
      ``data:``, ``file:`` and similar).

    Args:
        url: Candidate URL, normally a string taken from a search result.

    Returns:
        bool: ``True`` if every check passes, ``False`` otherwise. The
        function never raises for malformed input.
    """
    if not isinstance(url, str):
        return False

    # Reject characters that are invalid in a URL or dangerous inside an
    # HTML attribute before doing any parsing.
    if any(ch.isspace() or not ch.isprintable() or ch in "\"'<>`" for ch in url):
        return False

    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed authorities such as an
        # unbalanced IPv6 bracket.
        return False

    # urlparse lower-cases the scheme, so a plain membership test suffices.
    return parts.scheme in ("http", "https") and bool(parts.netloc)
31
+
32
def sanitize_query(query):
    """Strip punctuation from a user query and cap its length.

    Every character that is not a word character (Unicode letters, digits and
    underscore), whitespace or a hyphen is removed. The result is truncated to
    256 characters and surrounding whitespace is trimmed, so input made only
    of punctuation or blanks comes back as an empty string.

    Args:
        query: Raw text typed by the user. ``None`` or an empty string is
            accepted.

    Returns:
        str: The cleaned query, possibly empty.
    """
    # Imported locally because the module's import list does not include
    # ``re``; without it this function raised NameError on every call.
    import re

    if not query:
        return ""

    cleaned = re.sub(r'[^\w\s-]', '', query)
    return cleaned[:256].strip()
35
+
36
# Optional diagnostics support: the app still runs when the ``diagnostics``
# module cannot be imported; ``diagnostics_available`` records the outcome.
try:
    from diagnostics import diagnose_parquet_files
except ImportError:
    diagnostics_available = False
    logger.warning("Diagnostics module not available")
else:
    diagnostics_available = True
43
+
44
def add_diagnostics_ui(search_system):
    """Render the sidebar "Diagnostics" expander and run checks on demand.

    Nothing is checked until the user presses "Run Full System Check". The
    check then reports the result of ``diagnose_parquet_files`` for the
    ``metadata_shards`` directory, the number of loaded FAISS shards, and the
    process memory and CPU utilisation reported by ``psutil``.

    Args:
        search_system: Search backend. Only its ``index_shards`` attribute is
            read here, and it must support ``len()``.
    """
    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        if not st.button("Run Full System Check"):
            return

        with st.spinner("Performing comprehensive system check..."):
            metadata_col, index_col = st.columns(2)

            # Metadata check
            with metadata_col:
                if not diagnostics_available:
                    # The optional import failed, so the checker function does
                    # not exist; say so instead of raising NameError.
                    st.warning("Diagnostics module not available")
                else:
                    try:
                        # A truthy result is treated as "shards are valid".
                        metadata_ok = diagnose_parquet_files("metadata_shards")
                    except Exception:
                        # A failing checker is reported as a metadata problem
                        # rather than crashing the page; the traceback goes to
                        # the application log.
                        logger.exception("Metadata shard diagnostics failed")
                        metadata_ok = False

                    if metadata_ok:
                        st.success("✅ Metadata shards valid")
                    else:
                        st.error("❌ Metadata issues detected")

            # Index check
            with index_col:
                shard_count = len(search_system.index_shards)
                if shard_count > 0:
                    st.success(f"✅ {shard_count} FAISS shards loaded")
                else:
                    st.error("❌ No FAISS shards found")

            # Resource check
            rss_mb = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{rss_mb} MB")
            # With no interval the first cpu_percent() call returns a
            # meaningless 0.0; a short blocking sample gives a real reading.
            st.metric("CPU Utilization", f"{psutil.cpu_percent(interval=0.1)}%")
68
 
69
  def main():
70
  st.set_page_config(
 
73
  layout="wide"
74
  )
75
 
76
+ # Initialize search system with enhanced caching
77
+ @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...")
78
  def init_search_system():
79
+ try:
80
+ system = SemanticSearch()
81
+ system.initialize_system()
82
+ logger.info("Search system initialized successfully")
83
+ return system
84
+ except Exception as e:
85
+ logger.error(f"System initialization failed: {str(e)}")
86
+ st.error("Critical system initialization error. Check logs.")
87
+ st.stop()
88
 
89
+ # Custom CSS with enhanced visual design
90
  st.markdown("""
91
  <style>
92
  div[data-testid="stExpander"] div[role="button"] p {
93
  font-size: 1.2rem;
94
  font-weight: bold;
95
+ color: #1e88e5;
96
  }
97
  a.source-link {
98
+ color: #1a73e8 !important;
99
  text-decoration: none !important;
100
+ border-bottom: 2px solid transparent;
101
+ transition: all 0.3s ease;
102
  }
103
  a.source-link:hover {
104
+ border-bottom-color: #1a73e8;
105
+ opacity: 0.9;
106
+ }
107
+ .similarity-badge {
108
+ padding: 0.2em 0.5em;
109
+ border-radius: 4px;
110
+ background: #e3f2fd;
111
+ color: #1e88e5;
112
+ font-weight: 500;
113
  }
114
  </style>
115
  """, unsafe_allow_html=True)
116
 
117
+ try:
118
+ search_system = init_search_system()
119
+ except Exception as e:
120
+ st.error(f"Failed to initialize search system: {str(e)}")
121
+ st.stop()
122
 
123
  # Main UI components
124
  st.title("πŸ” Semantic Search Engine")
125
+
126
+ # Search input with sanitization
127
+ query = st.text_input("Enter your search query:",
128
+ placeholder="Search documents...",
129
+ max_chars=200)
130
+
131
  if query:
132
+ try:
133
+ # Sanitize and validate query
134
+ clean_query = sanitize_query(query)
135
+ if not clean_query:
136
+ st.warning("Please enter a valid search query")
137
+ st.stop()
138
+
139
+ with st.spinner("πŸ” Searching through documents..."):
140
+ start_time = time.time()
141
+ results = search_system.search(clean_query, 5)
142
+ search_duration = time.time() - start_time
143
+
144
+ if not results.empty:
145
+ st.subheader(f"Top Results ({search_duration:.2f}s)")
146
+
147
+ # Visualize results with enhanced formatting
148
+ for _, row in results.iterrows():
149
+ with st.expander(f"{row['title']}"):
150
+ # Similarity visualization
151
+ col1, col2 = st.columns([3, 1])
152
+ with col1:
153
+ st.markdown(f"**Summary**: {row['summary']}")
154
+ with col2:
155
+ st.markdown(
156
+ f"<div class='similarity-badge'>"
157
+ f"Confidence: {row['similarity']:.1%}"
158
+ f"</div>",
159
+ unsafe_allow_html=True
160
+ )
161
+ st.progress(float(row['similarity']))
162
+
163
+ # Safe URL handling
164
+ if is_valid_url(row['source']):
165
+ st.markdown(
166
+ f"<a class='source-link' href='{row['source']}' "
167
+ f"target='_blank' rel='noopener noreferrer'>"
168
+ f"🌐 View Source</a>",
169
+ unsafe_allow_html=True
170
+ )
171
+ else:
172
+ st.warning("Invalid source URL")
173
+ else:
174
+ st.warning("No matching documents found")
175
+ st.info("Try these tips:")
176
+ st.markdown("""
177
+ - Use more specific keywords
178
+ - Check your spelling
179
+ - Avoid special characters
180
+ """)
181
+
182
+ except Exception as e:
183
+ logger.error(f"Search failed: {str(e)}")
184
+ st.error("Search operation failed. Please try again.")
185
+
186
+ # System monitoring sidebar
187
  with st.sidebar:
188
+ st.subheader("πŸ“Š System Status")
189
+ col1, col2 = st.columns(2)
190
+
191
+ with col1:
192
+ st.metric("Total Documents",
193
+ f"{search_system.metadata_mgr.total_docs:,}",
194
+ help="Total indexed documents in system")
195
+
196
+ with col2:
197
+ st.metric("FAISS Shards",
198
+ len(search_system.index_shards),
199
+ help="Number of loaded vector index shards")
200
+
201
+ st.metric("Active Memory",
202
+ f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
203
+ help="Current memory usage by the application")
204
+
205
+ # Diagnostics section
206
+ if diagnostics_available:
207
+ add_diagnostics_ui(search_system)
208
+ else:
209
+ st.warning("Diagnostics module not available")
210
+
211
+ # Health check with error handling
212
+ if st.button("🩺 Run Health Check"):
213
+ try:
214
+ system_stats = {
215
+ "shards_loaded": len(search_system.index_shards),
216
+ "metadata_records": search_system.metadata_mgr.total_docs,
217
+ "memory_usage": f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
218
+ "active_threads": threading.active_count(),
219
+ "system_load": f"{os.getloadavg()[0]:.2f}"
220
+ }
221
+ st.json(system_stats)
222
+ except Exception as e:
223
+ st.error(f"Health check failed: {str(e)}")
224
+
225
+ # Cache management
226
+ if st.button("♻️ Clear Cache"):
227
+ try:
228
+ st.cache_resource.clear()
229
+ st.rerun()
230
+ except Exception as e:
231
+ st.error(f"Cache clearance failed: {str(e)}")
232
 
233
  if __name__ == "__main__":
234
  main()