Testys committed on
Commit
c056209
·
1 Parent(s): 4abeab7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -176
app.py CHANGED
@@ -5,9 +5,10 @@ import time
5
  import os
6
  import sys
7
  import psutil
8
- from urllib.parse import urlparse, quote
9
  import threading
10
  import re
 
11
 
12
  # Configure logging
13
  logging.basicConfig(
@@ -18,74 +19,66 @@ logging.basicConfig(
18
  logger = logging.getLogger("SemanticSearchApp")
19
 
20
  # Security validation functions
21
- def is_valid_url(url: str) -> bool:
22
- """Validate URL format and safety."""
23
  try:
24
  result = urlparse(url)
25
  return all([result.scheme, result.netloc])
26
- except Exception:
27
  return False
28
 
29
- def sanitize_query(query: str) -> str:
30
- """Sanitize user input to prevent injection attacks."""
31
  try:
32
- # Remove non-alphanumeric characters except spaces and hyphens.
33
- clean_query = re.sub(r'[^\w\s-]', '', query)
34
- return clean_query[:256] # Truncate to prevent overly long queries.
35
  except Exception as e:
36
  logger.error(f"Query sanitization failed: {str(e)}")
37
- return query[:256] # Fallback truncation.
38
-
39
- # Diagnostics integration
40
- try:
41
- from diagnostics import diagnose_parquet_files
42
- diagnostics_available = True
43
- except ImportError:
44
- diagnostics_available = False
45
- logger.warning("Diagnostics module not available")
46
-
47
- def add_diagnostics_ui(search_system: OptimizedSemanticSearch):
48
- """Enhanced diagnostics UI with proper directory checks."""
49
  with st.sidebar.expander("πŸ”§ Diagnostics", expanded=False):
50
- if st.button("Run Full System Check"):
51
- with st.spinner("Performing comprehensive system check..."):
52
- col1, col2 = st.columns(2)
53
-
54
- # Get actual paths from the search system.
55
- metadata_dir = search_system.metadata_mgr.metadata_dir
56
- faiss_dir = Path("compressed_shards") # Assuming FAISS index dir, if applicable.
57
-
58
- with col1:
59
- st.subheader("πŸ“‚ Metadata Validation")
60
- if metadata_dir.exists():
61
- parquet_files = list(metadata_dir.glob("*.parquet"))
62
- st.write(f"Directory: `{metadata_dir}`")
63
- st.write(f"Parquet Files Found: {'βœ…' if parquet_files else '❌'}")
64
- if diagnostics_available and diagnose_parquet_files(str(metadata_dir)):
65
- st.success("βœ… Metadata shards valid")
66
- else:
67
- st.error("❌ Metadata issues detected")
68
- else:
69
- st.error("Metadata directory not found")
70
-
71
- with col2:
72
- st.subheader("πŸ“š FAISS Validation")
73
- # For the FAISS index, we rely on our search system.
74
- if hasattr(search_system, 'index') and search_system.index is not None:
75
- st.success(f"βœ… FAISS index loaded with {search_system.index.ntotal} vectors")
76
- else:
77
- st.error("❌ No FAISS index loaded")
78
-
79
- st.subheader("πŸ’» System Resources")
80
- col_res1, col_res2 = st.columns(2)
81
- with col_res1:
82
- st.metric("Memory Usage",
83
- f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
84
- help="Current process memory usage")
85
- with col_res2:
86
- st.metric("CPU Utilization",
87
- f"{psutil.cpu_percent()}%",
88
- help="Total system CPU usage")
 
 
89
 
90
  def main():
91
  st.set_page_config(
@@ -94,146 +87,108 @@ def main():
94
  layout="wide"
95
  )
96
 
97
- # Initialize the optimized search system with caching.
98
- @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...")
99
- def init_search_system():
100
- try:
101
- system = OptimizedSemanticSearch()
102
- logger.info("Search system initialized successfully")
103
- return system
104
- except Exception as e:
105
- logger.error(f"System initialization failed: {str(e)}")
106
- st.error("Critical system initialization error. Check logs.")
107
- st.stop()
108
-
109
- # Custom CSS for enhanced visual design.
110
  st.markdown("""
111
  <style>
112
- div[data-testid="stExpander"] div[role="button"] p {
113
- font-size: 1.2rem;
114
- font-weight: bold;
115
- color: #1e88e5;
116
- }
117
- a.source-link {
118
- color: #1a73e8 !important;
119
- text-decoration: none !important;
120
- border-bottom: 2px solid transparent;
121
- transition: all 0.3s ease;
122
- }
123
- a.source-link:hover {
124
- border-bottom-color: #1a73e8;
125
- opacity: 0.9;
126
  }
127
- .similarity-badge {
128
- padding: 0.2em 0.5em;
129
- border-radius: 4px;
130
- background: #e3f2fd;
131
- color: #1e88e5;
132
- font-weight: 500;
133
  }
134
  </style>
135
  """, unsafe_allow_html=True)
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  try:
138
  search_system = init_search_system()
139
  except Exception as e:
140
- st.error(f"Failed to initialize search system: {str(e)}")
141
  st.stop()
142
 
 
143
  st.title("πŸ” Semantic Search Engine")
144
-
145
- # Search input with sanitization.
146
- query = st.text_input("Enter your search query:",
147
- placeholder="Search documents...",
148
- max_chars=200)
149
-
150
  if query:
151
- try:
152
- clean_query = sanitize_query(query)
153
- if not clean_query:
154
- st.warning("Please enter a valid search query")
155
- st.stop()
156
-
157
- with st.spinner("πŸ” Searching through documents..."):
158
- start_time = time.time()
159
- results = search_system.search(clean_query, top_k=5)
160
  search_duration = time.time() - start_time
161
 
162
- if results:
163
- st.subheader(f"Top Results (Search completed in {search_duration:.2f}s)")
164
-
165
- for res in results:
166
- with st.expander(f"{res.get('title', 'Untitled')}"):
167
- col1, col2 = st.columns([3, 1])
168
- with col1:
169
- st.markdown(f"**Summary:** {res.get('summary', '')}")
170
- with col2:
171
- st.markdown(
172
- f"<div class='similarity-badge'>"
173
- f"Confidence: {res.get('similarity', 0):.1%}"
174
- f"</div>",
175
- unsafe_allow_html=True
176
- )
177
- st.progress(float(res.get("similarity", 0)))
178
- if res.get("source"):
179
- # Validate URL before displaying.
180
- if is_valid_url(res["source"]):
181
- st.markdown(f"<a class='source-link' href='{res['source']}' target='_blank'>Source Link</a>", unsafe_allow_html=True)
182
- else:
183
- st.warning("Invalid source URL")
184
  else:
185
- st.warning("No matching documents found")
186
- st.info("Try these tips:")
187
- st.markdown("""
188
- - Use more specific keywords
189
- - Check your spelling
190
- - Avoid special characters
191
- """)
192
- except Exception as e:
193
- logger.error(f"Search failed: {str(e)}")
194
- st.error("Search operation failed. Please try again.")
 
 
195
 
196
- # Sidebar: System monitoring and diagnostics.
 
 
 
 
197
  with st.sidebar:
198
- st.subheader("πŸ“Š System Status")
 
199
  col1, col2 = st.columns(2)
200
  with col1:
201
- st.metric("Total Documents",
202
- f"{search_system.metadata_mgr.total_docs:,}",
203
- help="Total indexed documents in the system")
204
  with col2:
205
- if hasattr(search_system, 'index'):
206
- st.metric("FAISS Vectors",
207
- f"{search_system.index.ntotal:,}",
208
- help="Number of vectors in the FAISS index")
209
- st.metric("Active Memory",
210
- f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
211
- help="Current memory usage by the application")
212
-
213
- if diagnostics_available:
214
  add_diagnostics_ui(search_system)
215
- else:
216
- st.warning("Diagnostics module not available")
217
 
218
- if st.button("🩺 Run Health Check"):
219
- try:
220
- system_stats = {
221
- "shards_loaded": 1 if hasattr(search_system, 'index') else 0,
222
- "metadata_records": search_system.metadata_mgr.total_docs,
223
- "memory_usage": f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
224
- "active_threads": threading.active_count(),
225
- "system_load": f"{os.getloadavg()[0]:.2f}"
226
- }
227
- st.json(system_stats)
228
- except Exception as e:
229
- st.error(f"Health check failed: {str(e)}")
230
 
231
- if st.button("♻️ Clear Cache"):
232
- try:
233
- st.cache_resource.clear()
234
- st.experimental_rerun()
235
- except Exception as e:
236
- st.error(f"Cache clearance failed: {str(e)}")
237
 
238
  if __name__ == "__main__":
239
- main()
 
5
  import os
6
  import sys
7
  import psutil
8
+ from urllib.parse import quote
9
  import threading
10
  import re
11
+ from pathlib import Path
12
 
13
  # Configure logging
14
  logging.basicConfig(
 
19
  logger = logging.getLogger("SemanticSearchApp")
20
 
21
  # Security validation functions
22
def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: Candidate URL. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True if ``urlparse`` yields a non-empty scheme and netloc,
        False otherwise (including malformed input for which ``urlparse``
        raises ``ValueError``, e.g. an unclosed IPv6 bracket).
    """
    # Imported locally so the function does not depend on a module-level
    # ``urlparse`` import: the import line visible in this file only brings
    # in ``quote``, and a missing name here used to be swallowed by a bare
    # ``except`` and reported as "invalid URL" for every input.
    from urllib.parse import urlparse

    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    return bool(parsed.scheme and parsed.netloc)
29
 
30
def sanitize_query(query):
    """Strip disallowed characters from a user query and cap its length.

    Only word characters, whitespace and hyphens are kept. Surrounding
    whitespace is trimmed so that a blank query comes back as ``""``, and
    the result is limited to 256 characters.

    Args:
        query: Raw user input. Non-string values yield ``""``.

    Returns:
        str: The cleaned query, at most 256 characters long.
    """
    if not isinstance(query, str):
        # Neither re.sub nor slicing is meaningful on a non-string, so the
        # former "fallback slice" could raise a second time or hand back
        # unsanitised data; an empty result lets the caller reject it.
        return ""
    return re.sub(r'[^\w\s-]', '', query).strip()[:256]
37
+
38
def add_diagnostics_ui(search_system):
    """Render the diagnostics expander in the Streamlit sidebar.

    The expander contains three sections: metadata shard validation,
    FAISS index validation, and memory/CPU usage.

    Args:
        search_system: Search backend. Must expose
            ``metadata_mgr.metadata_dir`` (used via ``exists()`` and
            ``glob()``, so presumably a ``pathlib.Path`` -- confirm against
            the backend) and may expose ``index`` with an ``ntotal``
            attribute.
    """
    with st.sidebar.expander("πŸ”§ Diagnostics", expanded=False):
        col1, col2 = st.columns(2)

        # Metadata validation
        with col1:
            st.subheader("πŸ“‚ Metadata Validation")
            metadata_dir = search_system.metadata_mgr.metadata_dir
            if metadata_dir.exists():
                parquet_files = list(metadata_dir.glob("*.parquet"))
                st.write(f"Directory: `{metadata_dir}`")
                st.write(f"Parquet Files: {len(parquet_files)}")
                if parquet_files:
                    st.success("βœ… Valid metadata")
                else:
                    # Missing shards are a failure, so report them with the
                    # error widget rather than a success box.
                    st.error("❌ No parquet files found")
            else:
                st.error("Metadata directory not found")

        # FAISS validation
        with col2:
            st.subheader("πŸ“š FAISS Validation")
            faiss_path = Path("combined_index.faiss")
            if faiss_path.exists():
                st.write(f"Index Path: `{faiss_path}`")
                # Compare with None explicitly instead of relying on the
                # truthiness of the index object.
                index = getattr(search_system, 'index', None)
                if index is not None:
                    st.success("βœ… Index loaded")
                    st.write(f"Vectors: {index.ntotal:,}")
                else:
                    st.error("❌ Index not loaded")
            else:
                st.error("FAISS index not found")

        # System resources
        st.subheader("πŸ’» System Resources")
        col_res1, col_res2 = st.columns(2)
        with col_res1:
            # Resident set size of this process, converted from bytes to MiB
            mem_usage = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{mem_usage} MB")

        with col_res2:
            cpu_usage = psutil.cpu_percent()
            # Red above 80 % utilisation, green otherwise
            status_color = "#ff0000" if cpu_usage > 80 else "#00ff00"
            st.markdown(
                f"<span style='color:{status_color}'>CPU: {cpu_usage}%</span>",
                unsafe_allow_html=True,
            )
82
 
83
  def main():
84
  st.set_page_config(
 
87
  layout="wide"
88
  )
89
 
90
+ # Custom CSS styling
 
 
 
 
 
 
 
 
 
 
 
 
91
  st.markdown("""
92
  <style>
93
+ .metric-box {
94
+ padding: 15px;
95
+ border-radius: 8px;
96
+ background: #f8f9fa;
97
+ margin: 10px 0;
98
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
 
 
 
 
 
 
 
 
99
  }
100
+ .result-card {
101
+ padding: 15px;
102
+ border-left: 4px solid #1e88e5;
103
+ margin: 10px 0;
104
+ background: #fff;
 
105
  }
106
  </style>
107
  """, unsafe_allow_html=True)
108
 
109
+ # Initialize search system
110
+ @st.cache_resource(ttl=3600)
111
+ def init_search_system():
112
+ try:
113
+ system = OptimizedSemanticSearch()
114
+ system.initialize_system()
115
+ logger.info("Search system initialized")
116
+ return system
117
+ except Exception as e:
118
+ logger.error(f"Initialization failed: {str(e)}")
119
+ st.error("System initialization error")
120
+ st.stop()
121
+
122
  try:
123
  search_system = init_search_system()
124
  except Exception as e:
125
+ st.error(f"Critical error: {str(e)}")
126
  st.stop()
127
 
128
+ # Main UI components
129
  st.title("πŸ” Semantic Search Engine")
130
+ query = st.text_input("Search knowledge base:", placeholder="Enter your query...")
131
+
 
 
 
 
132
  if query:
133
+ clean_query = sanitize_query(query)
134
+ if not clean_query:
135
+ st.warning("Invalid query format")
136
+ st.stop()
137
+
138
+ with st.spinner("Analyzing documents..."):
139
+ start_time = time.time()
140
+ try:
141
+ results = search_system.search(clean_query, 5)
142
  search_duration = time.time() - start_time
143
 
144
+ if not results:
145
+ st.warning("No matches found")
146
+ st.info("Try refining your search terms")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  else:
148
+ st.subheader(f"Top Results ({search_duration:.2f}s)")
149
+ for res in results:
150
+ with st.expander(res.get('title', 'Untitled')):
151
+ st.markdown(f"**Summary**: {res.get('summary', '')}")
152
+ similarity = res.get('similarity', 0)
153
+ st.progress(similarity)
154
+ st.markdown(f"**Confidence**: {similarity:.1%}")
155
+ source = res.get('source', '')
156
+ if source and is_valid_url(source):
157
+ st.markdown(f"[View Source]({source})")
158
+ elif res.get('title'):
159
+ st.markdown(f"[Google Scholar Search](https://scholar.google.com/scholar?q={quote(res['title'])})")
160
 
161
+ except Exception as e:
162
+ logger.error(f"Search error: {str(e)}")
163
+ st.error("Search operation failed")
164
+
165
+ # System status sidebar
166
  with st.sidebar:
167
+ st.subheader("πŸ“Š System Health")
168
+
169
  col1, col2 = st.columns(2)
170
  with col1:
171
+ st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")
172
+
 
173
  with col2:
174
+ vectors = search_system.index.ntotal if hasattr(search_system, 'index') else 0
175
+ st.metric("Vectors", f"{vectors:,}")
176
+
177
+ # Diagnostics section
178
+ if st.checkbox("Show advanced diagnostics"):
 
 
 
 
179
  add_diagnostics_ui(search_system)
 
 
180
 
181
+ # System monitoring
182
+ st.subheader("βš™οΈ Monitoring")
183
+ with st.expander("Performance"):
184
+ mem = psutil.virtual_memory()
185
+ st.write(f"Memory: {mem.percent}% used")
186
+ st.write(f"CPU Cores: {psutil.cpu_count()}")
187
+ st.write(f"Active threads: {threading.active_count()}")
 
 
 
 
 
188
 
189
+ if st.button("πŸ”„ Refresh System"):
190
+ st.cache_resource.clear()
191
+ st.rerun()
 
 
 
192
 
193
  if __name__ == "__main__":
194
+ main()