import streamlit as st from search_utils import SemanticSearch import logging import time import os import sys import psutil # Added missing import from urllib.parse import urlparse # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler() ] ) logger = logging.getLogger("SemanticSearchApp") # Security validation functions def is_valid_url(url): """Validate URL format and safety""" try: result = urlparse(url) if not all([result.scheme, result.netloc]): return False # Add additional security checks here return True except: return False def sanitize_query(query): """Sanitize user input to prevent injection attacks""" return re.sub(r'[^\w\s-]', '', query)[:256] # Diagnostics integration try: from diagnostics import diagnose_parquet_files diagnostics_available = True except ImportError: diagnostics_available = False logger.warning("Diagnostics module not available") def add_diagnostics_ui(search_system): """Enhanced diagnostics UI with system checks""" with st.sidebar.expander("🔧 Diagnostics", expanded=False): if st.button("Run Full System Check"): with st.spinner("Performing comprehensive system check..."): col1, col2 = st.columns(2) # Metadata check with col1: if diagnose_parquet_files("metadata_shards"): st.success("✅ Metadata shards valid") else: st.error("❌ Metadata issues detected") # Index check with col2: if len(search_system.index_shards) > 0: st.success(f"✅ {len(search_system.index_shards)} FAISS shards loaded") else: st.error("❌ No FAISS shards found") # Resource check st.metric("Memory Usage", f"{psutil.Process().memory_info().rss // 1024 ** 2} MB") st.metric("CPU Utilization", f"{psutil.cpu_percent()}%") def main(): st.set_page_config( page_title="Semantic Search Engine", page_icon="🔍", layout="wide" ) # Initialize search system with enhanced caching @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...") def init_search_system(): try: system = SemanticSearch() system.initialize_system() logger.info("Search system initialized successfully") return system except Exception as e: logger.error(f"System initialization failed: {str(e)}") st.error("Critical system initialization error. Check logs.") st.stop() # Custom CSS with enhanced visual design st.markdown(""" """, unsafe_allow_html=True) try: search_system = init_search_system() except Exception as e: st.error(f"Failed to initialize search system: {str(e)}") st.stop() # Main UI components st.title("🔍 Semantic Search Engine") # Search input with sanitization query = st.text_input("Enter your search query:", placeholder="Search documents...", max_chars=200) if query: try: # Sanitize and validate query clean_query = sanitize_query(query) if not clean_query: st.warning("Please enter a valid search query") st.stop() with st.spinner("🔍 Searching through documents..."): start_time = time.time() results = search_system.search(clean_query, 5) search_duration = time.time() - start_time if not results.empty: st.subheader(f"Top Results ({search_duration:.2f}s)") # Visualize results with enhanced formatting for _, row in results.iterrows(): with st.expander(f"{row['title']}"): # Similarity visualization col1, col2 = st.columns([3, 1]) with col1: st.markdown(f"**Summary**: {row['summary']}") with col2: st.markdown( f"