Testys committed on
Commit
4abeab7
·
1 Parent(s): 5cefd40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -89
app.py CHANGED
@@ -1,46 +1,40 @@
1
  import streamlit as st
2
- from search_utils import SemanticSearch
3
  import logging
4
  import time
5
  import os
6
  import sys
7
- import psutil # Added missing import
8
- from urllib.parse import urlparse
9
  import threading
10
  import re
11
 
12
-
13
  # Configure logging
14
  logging.basicConfig(
15
  level=logging.INFO,
16
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
17
- handlers=[
18
- logging.StreamHandler()
19
- ]
20
  )
21
  logger = logging.getLogger("SemanticSearchApp")
22
 
23
  # Security validation functions
24
- def is_valid_url(url):
25
- """Validate URL format and safety"""
26
  try:
27
  result = urlparse(url)
28
- if not all([result.scheme, result.netloc]):
29
- return False
30
- # Add additional security checks here
31
- return True
32
- except:
33
  return False
34
 
35
- def sanitize_query(query):
36
- """Sanitize user input to prevent injection attacks"""
37
  try:
38
- # Remove non-alphanumeric characters except spaces and hyphens
39
  clean_query = re.sub(r'[^\w\s-]', '', query)
40
- return clean_query[:256] # Truncate to prevent long queries
41
  except Exception as e:
42
  logger.error(f"Query sanitization failed: {str(e)}")
43
- return query[:256] # Fallback truncation
44
 
45
  # Diagnostics integration
46
  try:
@@ -50,29 +44,24 @@ except ImportError:
50
  diagnostics_available = False
51
  logger.warning("Diagnostics module not available")
52
 
53
- def add_diagnostics_ui(search_system):
54
- """Enhanced diagnostics UI with proper directory checks"""
55
  with st.sidebar.expander("🔧 Diagnostics", expanded=False):
56
  if st.button("Run Full System Check"):
57
  with st.spinner("Performing comprehensive system check..."):
58
- # Create columns for organized display
59
  col1, col2 = st.columns(2)
60
 
61
- # Get actual paths from the search system
62
- metadata_dir = search_system.metadata_mgr.shard_dir
63
- faiss_dir = search_system.shard_dir # From SemanticSearch class
64
 
65
  with col1:
66
- # Metadata directory check
67
  st.subheader("📂 Metadata Validation")
68
  if metadata_dir.exists():
69
- # Check directory structure
70
- dir_status = any(metadata_dir.glob("*.parquet"))
71
  st.write(f"Directory: `{metadata_dir}`")
72
- st.write(f"Parquet Files Found: {'✅' if dir_status else '❌'}")
73
-
74
- # Check individual files
75
- if diagnose_parquet_files(str(metadata_dir)):
76
  st.success("✅ Metadata shards valid")
77
  else:
78
  st.error("❌ Metadata issues detected")
@@ -80,33 +69,24 @@ def add_diagnostics_ui(search_system):
80
  st.error("Metadata directory not found")
81
 
82
  with col2:
83
- # FAISS index check
84
  st.subheader("📚 FAISS Validation")
85
- if faiss_dir.exists():
86
- index_files = list(faiss_dir.glob("*.index"))
87
- st.write(f"Directory: `{faiss_dir}`")
88
- st.write(f"Index Files Found: {len(index_files)}")
89
-
90
- if len(search_system.index_shards) > 0:
91
- st.success(f"✅ {len(search_system.index_shards)} FAISS shards loaded")
92
- st.write(f"Total Vectors: {sum(s.ntotal for s in search_system.index_shards):,}")
93
- else:
94
- st.error("❌ No FAISS shards loaded")
95
  else:
96
- st.error("FAISS directory not found")
97
 
98
- # System resource check
99
  st.subheader("💻 System Resources")
100
  col_res1, col_res2 = st.columns(2)
101
  with col_res1:
102
  st.metric("Memory Usage",
103
- f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
104
- help="Current process memory usage")
105
-
106
  with col_res2:
107
  st.metric("CPU Utilization",
108
- f"{psutil.cpu_percent()}%",
109
- help="Total system CPU usage")
 
110
  def main():
111
  st.set_page_config(
112
  page_title="Semantic Search Engine",
@@ -114,12 +94,11 @@ def main():
114
  layout="wide"
115
  )
116
 
117
- # Initialize search system with enhanced caching
118
  @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...")
119
  def init_search_system():
120
  try:
121
- system = SemanticSearch()
122
- system.initialize_system()
123
  logger.info("Search system initialized successfully")
124
  return system
125
  except Exception as e:
@@ -127,7 +106,7 @@ def main():
127
  st.error("Critical system initialization error. Check logs.")
128
  st.stop()
129
 
130
- # Custom CSS with enhanced visual design
131
  st.markdown("""
132
  <style>
133
  div[data-testid="stExpander"] div[role="button"] p {
@@ -161,17 +140,15 @@ def main():
161
  st.error(f"Failed to initialize search system: {str(e)}")
162
  st.stop()
163
 
164
- # Main UI components
165
  st.title("🔍 Semantic Search Engine")
166
 
167
- # Search input with sanitization
168
  query = st.text_input("Enter your search query:",
169
- placeholder="Search documents...",
170
- max_chars=200)
171
 
172
  if query:
173
  try:
174
- # Sanitize and validate query
175
  clean_query = sanitize_query(query)
176
  if not clean_query:
177
  st.warning("Please enter a valid search query")
@@ -179,29 +156,29 @@ def main():
179
 
180
  with st.spinner("🔍 Searching through documents..."):
181
  start_time = time.time()
182
- results = search_system.search(clean_query, 5)
183
  search_duration = time.time() - start_time
184
 
185
- if not results.empty:
186
- st.subheader(f"Top Results ({search_duration:.2f}s)")
187
 
188
- # Visualize results with enhanced formatting
189
- for _, row in results.iterrows():
190
- with st.expander(f"{row['title']}"):
191
- # Similarity visualization
192
  col1, col2 = st.columns([3, 1])
193
  with col1:
194
- st.markdown(f"**Summary**: {row['summary']}")
195
  with col2:
196
  st.markdown(
197
  f"<div class='similarity-badge'>"
198
- f"Confidence: {row['similarity']:.1%}"
199
  f"</div>",
200
  unsafe_allow_html=True
201
  )
202
- st.progress(float(row['similarity']))
203
- if row['source']:
204
- st.markdown(row['source'], unsafe_allow_html=True)
 
 
205
  else:
206
  st.warning("Invalid source URL")
207
  else:
@@ -212,41 +189,36 @@ def main():
212
  - Check your spelling
213
  - Avoid special characters
214
  """)
215
-
216
  except Exception as e:
217
  logger.error(f"Search failed: {str(e)}")
218
  st.error("Search operation failed. Please try again.")
219
 
220
- # System monitoring sidebar
221
  with st.sidebar:
222
  st.subheader("📊 System Status")
223
  col1, col2 = st.columns(2)
224
-
225
  with col1:
226
  st.metric("Total Documents",
227
- f"{search_system.metadata_mgr.total_docs:,}",
228
- help="Total indexed documents in system")
229
-
230
  with col2:
231
- st.metric("FAISS Shards",
232
- len(search_system.index_shards),
233
- help="Number of loaded vector index shards")
234
-
235
  st.metric("Active Memory",
236
- f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
237
- help="Current memory usage by the application")
238
 
239
- # Diagnostics section
240
  if diagnostics_available:
241
  add_diagnostics_ui(search_system)
242
  else:
243
  st.warning("Diagnostics module not available")
244
 
245
- # Health check with error handling
246
  if st.button("🩺 Run Health Check"):
247
  try:
248
  system_stats = {
249
- "shards_loaded": len(search_system.index_shards),
250
  "metadata_records": search_system.metadata_mgr.total_docs,
251
  "memory_usage": f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
252
  "active_threads": threading.active_count(),
@@ -256,13 +228,12 @@ def main():
256
  except Exception as e:
257
  st.error(f"Health check failed: {str(e)}")
258
 
259
- # Cache management
260
  if st.button("♻️ Clear Cache"):
261
  try:
262
  st.cache_resource.clear()
263
- st.rerun()
264
  except Exception as e:
265
  st.error(f"Cache clearance failed: {str(e)}")
266
 
267
  if __name__ == "__main__":
268
- main()
 
1
  import streamlit as st
2
+ from search_utils import OptimizedSemanticSearch
3
  import logging
4
  import time
5
  import os
6
  import sys
7
+ import psutil
8
+ from urllib.parse import urlparse, quote
9
  import threading
10
  import re
11
 
 
12
  # Configure logging
13
  logging.basicConfig(
14
  level=logging.INFO,
15
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
16
+ handlers=[logging.StreamHandler()]
 
 
17
  )
18
  logger = logging.getLogger("SemanticSearchApp")
19
 
20
  # Security validation functions
21
+ def is_valid_url(url: str) -> bool:
22
+ """Validate URL format and safety."""
23
  try:
24
  result = urlparse(url)
25
+ return all([result.scheme, result.netloc])
26
+ except Exception:
 
 
 
27
  return False
28
 
29
+ def sanitize_query(query: str) -> str:
30
+ """Sanitize user input to prevent injection attacks."""
31
  try:
32
+ # Remove non-alphanumeric characters except spaces and hyphens.
33
  clean_query = re.sub(r'[^\w\s-]', '', query)
34
+ return clean_query[:256] # Truncate to prevent overly long queries.
35
  except Exception as e:
36
  logger.error(f"Query sanitization failed: {str(e)}")
37
+ return query[:256] # Fallback truncation.
38
 
39
  # Diagnostics integration
40
  try:
 
44
  diagnostics_available = False
45
  logger.warning("Diagnostics module not available")
46
 
47
+ def add_diagnostics_ui(search_system: OptimizedSemanticSearch):
48
+ """Enhanced diagnostics UI with proper directory checks."""
49
  with st.sidebar.expander("🔧 Diagnostics", expanded=False):
50
  if st.button("Run Full System Check"):
51
  with st.spinner("Performing comprehensive system check..."):
 
52
  col1, col2 = st.columns(2)
53
 
54
+ # Get actual paths from the search system.
55
+ metadata_dir = search_system.metadata_mgr.metadata_dir
56
+ faiss_dir = Path("compressed_shards") # Assuming FAISS index dir, if applicable.
57
 
58
  with col1:
 
59
  st.subheader("📂 Metadata Validation")
60
  if metadata_dir.exists():
61
+ parquet_files = list(metadata_dir.glob("*.parquet"))
 
62
  st.write(f"Directory: `{metadata_dir}`")
63
+ st.write(f"Parquet Files Found: {'✅' if parquet_files else '❌'}")
64
+ if diagnostics_available and diagnose_parquet_files(str(metadata_dir)):
 
 
65
  st.success("✅ Metadata shards valid")
66
  else:
67
  st.error("❌ Metadata issues detected")
 
69
  st.error("Metadata directory not found")
70
 
71
  with col2:
 
72
  st.subheader("📚 FAISS Validation")
73
+ # For the FAISS index, we rely on our search system.
74
+ if hasattr(search_system, 'index') and search_system.index is not None:
75
+ st.success(f"✅ FAISS index loaded with {search_system.index.ntotal} vectors")
 
 
 
 
 
 
 
76
  else:
77
+ st.error("❌ No FAISS index loaded")
78
 
 
79
  st.subheader("💻 System Resources")
80
  col_res1, col_res2 = st.columns(2)
81
  with col_res1:
82
  st.metric("Memory Usage",
83
+ f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
84
+ help="Current process memory usage")
 
85
  with col_res2:
86
  st.metric("CPU Utilization",
87
+ f"{psutil.cpu_percent()}%",
88
+ help="Total system CPU usage")
89
+
90
  def main():
91
  st.set_page_config(
92
  page_title="Semantic Search Engine",
 
94
  layout="wide"
95
  )
96
 
97
+ # Initialize the optimized search system with caching.
98
  @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...")
99
  def init_search_system():
100
  try:
101
+ system = OptimizedSemanticSearch()
 
102
  logger.info("Search system initialized successfully")
103
  return system
104
  except Exception as e:
 
106
  st.error("Critical system initialization error. Check logs.")
107
  st.stop()
108
 
109
+ # Custom CSS for enhanced visual design.
110
  st.markdown("""
111
  <style>
112
  div[data-testid="stExpander"] div[role="button"] p {
 
140
  st.error(f"Failed to initialize search system: {str(e)}")
141
  st.stop()
142
 
 
143
  st.title("🔍 Semantic Search Engine")
144
 
145
+ # Search input with sanitization.
146
  query = st.text_input("Enter your search query:",
147
+ placeholder="Search documents...",
148
+ max_chars=200)
149
 
150
  if query:
151
  try:
 
152
  clean_query = sanitize_query(query)
153
  if not clean_query:
154
  st.warning("Please enter a valid search query")
 
156
 
157
  with st.spinner("🔍 Searching through documents..."):
158
  start_time = time.time()
159
+ results = search_system.search(clean_query, top_k=5)
160
  search_duration = time.time() - start_time
161
 
162
+ if results:
163
+ st.subheader(f"Top Results (Search completed in {search_duration:.2f}s)")
164
 
165
+ for res in results:
166
+ with st.expander(f"{res.get('title', 'Untitled')}"):
 
 
167
  col1, col2 = st.columns([3, 1])
168
  with col1:
169
+ st.markdown(f"**Summary:** {res.get('summary', '')}")
170
  with col2:
171
  st.markdown(
172
  f"<div class='similarity-badge'>"
173
+ f"Confidence: {res.get('similarity', 0):.1%}"
174
  f"</div>",
175
  unsafe_allow_html=True
176
  )
177
+ st.progress(float(res.get("similarity", 0)))
178
+ if res.get("source"):
179
+ # Validate URL before displaying.
180
+ if is_valid_url(res["source"]):
181
+ st.markdown(f"<a class='source-link' href='{res['source']}' target='_blank'>Source Link</a>", unsafe_allow_html=True)
182
  else:
183
  st.warning("Invalid source URL")
184
  else:
 
189
  - Check your spelling
190
  - Avoid special characters
191
  """)
 
192
  except Exception as e:
193
  logger.error(f"Search failed: {str(e)}")
194
  st.error("Search operation failed. Please try again.")
195
 
196
+ # Sidebar: System monitoring and diagnostics.
197
  with st.sidebar:
198
  st.subheader("📊 System Status")
199
  col1, col2 = st.columns(2)
 
200
  with col1:
201
  st.metric("Total Documents",
202
+ f"{search_system.metadata_mgr.total_docs:,}",
203
+ help="Total indexed documents in the system")
 
204
  with col2:
205
+ if hasattr(search_system, 'index'):
206
+ st.metric("FAISS Vectors",
207
+ f"{search_system.index.ntotal:,}",
208
+ help="Number of vectors in the FAISS index")
209
  st.metric("Active Memory",
210
+ f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
211
+ help="Current memory usage by the application")
212
 
 
213
  if diagnostics_available:
214
  add_diagnostics_ui(search_system)
215
  else:
216
  st.warning("Diagnostics module not available")
217
 
 
218
  if st.button("🩺 Run Health Check"):
219
  try:
220
  system_stats = {
221
+ "shards_loaded": 1 if hasattr(search_system, 'index') else 0,
222
  "metadata_records": search_system.metadata_mgr.total_docs,
223
  "memory_usage": f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
224
  "active_threads": threading.active_count(),
 
228
  except Exception as e:
229
  st.error(f"Health check failed: {str(e)}")
230
 
 
231
  if st.button("♻️ Clear Cache"):
232
  try:
233
  st.cache_resource.clear()
234
+ st.experimental_rerun()
235
  except Exception as e:
236
  st.error(f"Cache clearance failed: {str(e)}")
237
 
238
  if __name__ == "__main__":
239
+ main()