"""Streamlit front end for the semantic search engine.

Renders a search box, the ranked results returned by ``SemanticSearch``,
and a sidebar with system status, optional diagnostics, a health check and
cache controls.
"""

import html
import logging
import math
import os
import re
import sys
import threading
import time
from urllib.parse import urlparse

import psutil
import streamlit as st

from search_utils import SemanticSearch

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger("SemanticSearchApp")

# Only these schemes may be turned into clickable result links; anything else
# (javascript:, data:, file:, ...) is rejected by is_valid_url().
_ALLOWED_URL_SCHEMES = frozenset({"http", "https"})

# Everything except word characters, whitespace and hyphens is dropped from
# user queries.
_UNSAFE_QUERY_CHARS = re.compile(r'[^\w\s-]')
_MAX_QUERY_LENGTH = 256

_BYTES_PER_MB = 1024 ** 2
_RESULT_COUNT = 5

_SEARCH_TIPS = (
    "\n"
    "- Use more specific keywords\n"
    "- Check your spelling\n"
    "- Avoid special characters\n"
)


# Security validation functions
def is_valid_url(url):
    """Return True when *url* is an absolute http(s) URL.

    Args:
        url: Candidate value; anything that is not a ``str`` is rejected.

    Returns:
        ``True`` only if the URL parses, has a network location and uses an
        allowed scheme. Restricting the scheme matters because the URL is
        later embedded in an ``href`` attribute.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed input (e.g. bad IPv6 host).
        return False
    return parsed.scheme in _ALLOWED_URL_SCHEMES and bool(parsed.netloc)


def sanitize_query(query):
    """Strip unsafe characters from a user query and cap its length.

    Args:
        query: Raw text entered by the user.

    Returns:
        *query* with every character that is not a word character,
        whitespace or a hyphen removed, truncated to 256 characters.
    """
    return _UNSAFE_QUERY_CHARS.sub('', query)[:_MAX_QUERY_LENGTH]


# Diagnostics integration
try:
    from diagnostics import diagnose_parquet_files
    diagnostics_available = True
except ImportError:
    diagnostics_available = False
    logger.warning("Diagnostics module not available")


def _memory_usage_mb():
    """Return the resident set size of this process in whole megabytes."""
    return psutil.Process().memory_info().rss // _BYTES_PER_MB


def _system_load():
    """Return the 1-minute load average as text, or ``"n/a"`` if unavailable."""
    try:
        return f"{os.getloadavg()[0]:.2f}"
    except (AttributeError, OSError):
        # os.getloadavg does not exist on Windows and may raise OSError
        # elsewhere; the health check should still report the other stats.
        return "n/a"


def _progress_fraction(value):
    """Coerce a similarity score into the 0.0-1.0 range st.progress accepts."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    if math.isnan(score):
        return 0.0
    return min(max(score, 0.0), 1.0)


def add_diagnostics_ui(search_system):
    """Render the sidebar diagnostics expander with on-demand system checks.

    Args:
        search_system: Object exposing ``index_shards`` (the loaded FAISS
            shards); only its length is used here.
    """
    if not diagnostics_available:
        # diagnose_parquet_files was never imported; avoid a NameError.
        st.warning("Diagnostics module not available")
        return

    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        if not st.button("Run Full System Check"):
            return
        with st.spinner("Performing comprehensive system check..."):
            col1, col2 = st.columns(2)

            # Metadata check
            with col1:
                if diagnose_parquet_files("metadata_shards"):
                    st.success("✅ Metadata shards valid")
                else:
                    st.error("❌ Metadata issues detected")

            # Index check
            with col2:
                shard_count = len(search_system.index_shards)
                if shard_count > 0:
                    st.success(f"✅ {shard_count} FAISS shards loaded")
                else:
                    st.error("❌ No FAISS shards found")

            # Resource check
            st.metric("Memory Usage", f"{_memory_usage_mb()} MB")
            # A short sampling interval is used because the first
            # non-blocking cpu_percent() call returns a meaningless 0.0.
            st.metric("CPU Utilization", f"{psutil.cpu_percent(interval=0.1)}%")


def _render_result(row):
    """Render one search hit (title, summary, confidence, source link)."""
    with st.expander(f"{row['title']}"):
        # Similarity visualization
        col1, col2 = st.columns([3, 1])
        with col1:
            st.markdown(f"**Summary**: {row['summary']}")
        with col2:
            # NOTE(review): the HTML wrapper around this text was lost in the
            # copy of the file under review, so it is rendered as plain
            # markdown without unsafe_allow_html.
            st.markdown(f"Confidence: {row['similarity']:.1%}")
            st.progress(_progress_fraction(row['similarity']))

        # Safe URL handling
        source = row['source']
        if is_valid_url(source):
            # NOTE(review): the anchor markup was lost in the reviewed copy;
            # reconstructed as a plain link. The URL is attribute-escaped so
            # it cannot break out of the href.
            st.markdown(
                f'<a href="{html.escape(source, quote=True)}" target="_blank" '
                f'rel="noopener noreferrer">🌐 View Source</a>',
                unsafe_allow_html=True,
            )
        else:
            st.warning("Invalid source URL")


def _handle_query(search_system, query):
    """Sanitize *query*, run the search and render results or guidance."""
    try:
        # Sanitize and validate query
        clean_query = sanitize_query(query).strip()
        if not clean_query:
            st.warning("Please enter a valid search query")
            # Return instead of st.stop() so the sidebar is still rendered.
            return

        with st.spinner("🔍 Searching through documents..."):
            started = time.perf_counter()
            results = search_system.search(clean_query, _RESULT_COUNT)
            search_duration = time.perf_counter() - started

        if results.empty:
            st.warning("No matching documents found")
            st.info("Try these tips:")
            st.markdown(_SEARCH_TIPS)
            return

        st.subheader(f"Top Results ({search_duration:.2f}s)")
        for _, row in results.iterrows():
            _render_result(row)
    except Exception:
        # UI boundary: log the traceback, show a generic message to the user.
        logger.exception("Search failed")
        st.error("Search operation failed. Please try again.")


def _render_sidebar(search_system):
    """Render system status, diagnostics, health check and cache controls."""
    with st.sidebar:
        st.subheader("📊 System Status")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Total Documents",
                      f"{search_system.metadata_mgr.total_docs:,}",
                      help="Total indexed documents in system")
        with col2:
            st.metric("FAISS Shards",
                      len(search_system.index_shards),
                      help="Number of loaded vector index shards")
        st.metric("Active Memory",
                  f"{_memory_usage_mb()} MB",
                  help="Current memory usage by the application")

        # Diagnostics section
        if diagnostics_available:
            add_diagnostics_ui(search_system)
        else:
            st.warning("Diagnostics module not available")

        # Health check with error handling
        if st.button("🩺 Run Health Check"):
            try:
                system_stats = {
                    "shards_loaded": len(search_system.index_shards),
                    "metadata_records": search_system.metadata_mgr.total_docs,
                    "memory_usage": f"{_memory_usage_mb()} MB",
                    "active_threads": threading.active_count(),
                    "system_load": _system_load(),
                }
                st.json(system_stats)
            except Exception as e:
                st.error(f"Health check failed: {str(e)}")

        # Cache management
        if st.button("♻️ Clear Cache"):
            try:
                st.cache_resource.clear()
                st.rerun()
            except Exception as e:
                st.error(f"Cache clearance failed: {str(e)}")


def main():
    """Entry point: configure the page, load the engine and draw the UI."""
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="🔍",
        layout="wide",
    )

    # Initialize search system with enhanced caching
    @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...")
    def init_search_system():
        """Build and initialize the SemanticSearch instance (cached for 1h)."""
        try:
            system = SemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized successfully")
            return system
        except Exception:
            logger.exception("System initialization failed")
            st.error("Critical system initialization error. Check logs.")
            st.stop()

    # Custom CSS with enhanced visual design
    # NOTE(review): the stylesheet body is empty in the reviewed copy of this
    # file; the call is kept as a placeholder.
    st.markdown(""" """, unsafe_allow_html=True)

    try:
        search_system = init_search_system()
    except Exception as e:
        st.error(f"Failed to initialize search system: {str(e)}")
        st.stop()

    # Main UI components
    st.title("🔍 Semantic Search Engine")

    # Search input with sanitization
    query = st.text_input("Enter your search query:",
                          placeholder="Search documents...",
                          max_chars=200)
    if query:
        _handle_query(search_system, query)

    # System monitoring sidebar
    _render_sidebar(search_system)


if __name__ == "__main__":
    main()