import logging
import os
import re
import sys
import threading
import time
from urllib.parse import urlparse

import psutil  # Added missing import
import streamlit as st

from search_utils import SemanticSearch

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("SemanticSearchApp")

# Upper bound on the length of a sanitized query.
MAX_QUERY_LENGTH = 256

# Matches every character that is NOT a word character, whitespace or a
# hyphen.  For str patterns `\w` is Unicode-aware and includes the
# underscore, so accented letters, CJK text and "_" are kept.
_DISALLOWED_QUERY_CHARS = re.compile(r'[^\w\s-]')


# Security validation functions
def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: The candidate URL.

    Returns:
        bool: True if ``urlparse`` yields a non-empty scheme and netloc,
        False otherwise, including when the value cannot be parsed at all.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. a broken IPv6 literal).
        # TypeError / AttributeError: the argument is not str/bytes-like.
        return False
    # Add additional security checks here
    return bool(parsed.scheme and parsed.netloc)


def sanitize_query(query):
    """Sanitize a user-supplied search query.

    Every character other than word characters, whitespace and hyphens is
    removed, surrounding whitespace is stripped, and the result is capped
    at ``MAX_QUERY_LENGTH`` characters.

    Args:
        query: Raw text entered by the user.

    Returns:
        str: The cleaned query.  An empty string is returned when nothing
        usable remains or when *query* is not a string, so callers can
        reject it with a plain truthiness check and raw input is never
        passed through unsanitized.
    """
    if not isinstance(query, str):
        logger.error(
            "Query sanitization failed: expected str, got %s",
            type(query).__name__,
        )
        return ""

    cleaned = _DISALLOWED_QUERY_CHARS.sub('', query).strip()
    # Truncating can leave a trailing space behind; strip it again.
    return cleaned[:MAX_QUERY_LENGTH].rstrip()


# Diagnostics integration
try:
    from diagnostics import diagnose_parquet_files
    diagnostics_available = True
except ImportError:
    # Keep the name bound so a later reference is never a NameError.
    diagnose_parquet_files = None
    diagnostics_available = False
    logger.warning("Diagnostics module not available")


def add_diagnostics_ui(search_system):
    """Render the sidebar "Diagnostics" expander and run checks on demand.

    When the "Run Full System Check" button is pressed, this reports on the
    metadata shard directory, the FAISS index directory and loaded shards,
    and the memory / CPU usage reported by psutil.

    Args:
        search_system: Object exposing ``metadata_mgr.shard_dir``,
            ``shard_dir`` and ``index_shards`` (each shard with an
            ``ntotal`` attribute).  Both directories are used through
            ``exists()`` and ``glob()`` -- presumably ``pathlib.Path``
            objects; confirm against ``SemanticSearch``.
    """
    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        if not st.button("Run Full System Check"):
            return

        with st.spinner("Performing comprehensive system check..."):
            # Create columns for organized display
            col1, col2 = st.columns(2)

            # Get actual paths from the search system
            metadata_dir = search_system.metadata_mgr.shard_dir
            faiss_dir = search_system.shard_dir  # From SemanticSearch class

            with col1:
                # Metadata directory check
                st.subheader("📂 Metadata Validation")
                if metadata_dir.exists():
                    # Check directory structure
                    dir_status = any(metadata_dir.glob("*.parquet"))
                    st.write(f"Directory: `{metadata_dir}`")
                    st.write(f"Parquet Files Found: {'✅' if dir_status else '❌'}")

                    # Check individual files; the validator is optional,
                    # so only call it when its import succeeded.
                    if not diagnostics_available:
                        st.warning("Diagnostics module not available")
                    elif diagnose_parquet_files(str(metadata_dir)):
                        st.success("✅ Metadata shards valid")
                    else:
                        st.error("❌ Metadata issues detected")
                else:
                    st.error("Metadata directory not found")

            with col2:
                # FAISS index check
                st.subheader("📚 FAISS Validation")
                if faiss_dir.exists():
                    index_files = list(faiss_dir.glob("*.index"))
                    st.write(f"Directory: `{faiss_dir}`")
                    st.write(f"Index Files Found: {len(index_files)}")

                    shards = search_system.index_shards
                    if len(shards) > 0:
                        st.success(f"✅ {len(shards)} FAISS shards loaded")
                        total_vectors = sum(s.ntotal for s in shards)
                        st.write(f"Total Vectors: {total_vectors:,}")
                    else:
                        st.error("❌ No FAISS shards loaded")
                else:
                    st.error("FAISS directory not found")

            # System resource check
            st.subheader("💻 System Resources")
            col_res1, col_res2 = st.columns(2)
            with col_res1:
                st.metric("Memory Usage",
                          f"{psutil.Process().memory_info().rss // 1024 ** 2} MB",
                          help="Current process memory usage")
            with col_res2:
                # A short sampling interval is required: with no interval
                # the first call to cpu_percent() returns a meaningless 0.0.
                st.metric("CPU Utilization",
                          f"{psutil.cpu_percent(interval=0.1)}%",
                          help="Total system CPU usage")


# NOTE(review): main() runs past this span; its text is left exactly as found.
def main(): st.set_page_config( page_title="Semantic Search Engine", page_icon="🔍", layout="wide" ) # Initialize search system with enhanced caching @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...") def init_search_system(): try: system = SemanticSearch() system.initialize_system() logger.info("Search system initialized successfully") return system except Exception as e: logger.error(f"System initialization failed: {str(e)}") st.error("Critical system initialization error. 
Check logs.") st.stop() # Custom CSS with enhanced visual design st.markdown(""" """, unsafe_allow_html=True) try: search_system = init_search_system() except Exception as e: st.error(f"Failed to initialize search system: {str(e)}") st.stop() # Main UI components st.title("🔍 Semantic Search Engine") # Search input with sanitization query = st.text_input("Enter your search query:", placeholder="Search documents...", max_chars=200) if query: try: # Sanitize and validate query clean_query = sanitize_query(query) if not clean_query: st.warning("Please enter a valid search query") st.stop() with st.spinner("🔍 Searching through documents..."): start_time = time.time() results = search_system.search(clean_query, 5) search_duration = time.time() - start_time if not results.empty: st.subheader(f"Top Results ({search_duration:.2f}s)") # Visualize results with enhanced formatting for _, row in results.iterrows(): with st.expander(f"{row['title']}"): # Similarity visualization col1, col2 = st.columns([3, 1]) with col1: st.markdown(f"**Summary**: {row['summary']}") with col2: st.markdown( f"