"""Streamlit front end for the semantic search engine.

Renders a search box, the ranked results returned by ``SemanticSearch``,
and a sidebar with system status, optional diagnostics, a health check and
a cache-reset button.
"""

import html
import logging
import math
import os
import re
import sys
import threading
import time
from pathlib import Path
from urllib.parse import urlparse

import psutil
import streamlit as st

from search_utils import SemanticSearch

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("SemanticSearchApp")

# Only these schemes may end up inside a rendered hyperlink.
_ALLOWED_URL_SCHEMES = frozenset({"http", "https"})
# Upper bound on the length of a sanitized query.
_MAX_QUERY_LENGTH = 256
# Anything that is not a word character, whitespace or a hyphen is dropped.
_DISALLOWED_QUERY_CHARS = re.compile(r'[^\w\s-]')
# Number of results requested from the search system per query.
_TOP_K = 5


# Security validation functions
def is_valid_url(url):
    """Return True if *url* is an absolute http(s) URL with a network location.

    Non-string input and strings that ``urlparse`` rejects yield False
    instead of raising. Schemes other than http/https are refused because
    the URL is rendered as a clickable link.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed input (e.g. bad IPv6 host).
        return False
    return parsed.scheme.lower() in _ALLOWED_URL_SCHEMES and bool(parsed.netloc)


def sanitize_query(query):
    """Strip unsafe characters from a user query and bound its length.

    Keeps word characters (letters, digits, underscore), whitespace and
    hyphens; removes everything else. The result is truncated to 256
    characters and surrounding whitespace is trimmed, so a query consisting
    only of punctuation or spaces comes back as ``""``.

    Returns ``""`` (and logs an error) when *query* is not a string, rather
    than handing back unsanitized input.
    """
    if not isinstance(query, str):
        logger.error(
            "Query sanitization failed: expected str, got %s",
            type(query).__name__,
        )
        return ""
    cleaned = _DISALLOWED_QUERY_CHARS.sub('', query)
    return cleaned[:_MAX_QUERY_LENGTH].strip()


# Diagnostics integration
try:
    from diagnostics import diagnose_parquet_files
    diagnostics_available = True
except ImportError:
    diagnostics_available = False
    logger.warning("Diagnostics module not available")


def _memory_usage_mb():
    """Return the current process resident set size formatted as '<n> MB'."""
    return f"{psutil.Process().memory_info().rss // 1024 ** 2} MB"


def _system_load():
    """Return the 1-minute load average as text, or 'n/a' if unavailable."""
    try:
        return f"{os.getloadavg()[0]:.2f}"
    except (AttributeError, OSError):
        # os.getloadavg does not exist on Windows and may raise OSError
        # elsewhere; the health check should still report the other stats.
        return "n/a"


def _clamp_unit(value):
    """Clamp *value* to [0.0, 1.0]; NaN maps to 0.0."""
    if math.isnan(value):
        return 0.0
    return min(max(value, 0.0), 1.0)


def add_diagnostics_ui(search_system):
    """Render the diagnostics expander in the sidebar.

    When the "Run Full System Check" button is pressed, reports on the
    metadata shard directory (including ``diagnose_parquet_files``), the
    FAISS index directory and loaded shards, and process/system resources.

    Must only be called when the ``diagnostics`` module imported
    successfully (``diagnostics_available`` is True).
    """
    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        if not st.button("Run Full System Check"):
            return

        with st.spinner("Performing comprehensive system check..."):
            # Create columns for organized display
            col1, col2 = st.columns(2)

            # Get actual paths from the search system; Path() is a no-op for
            # Path objects and makes plain string paths work as well.
            metadata_dir = Path(search_system.metadata_mgr.shard_dir)
            faiss_dir = Path(search_system.shard_dir)  # From SemanticSearch class

            with col1:
                # Metadata directory check
                st.subheader("📂 Metadata Validation")
                if metadata_dir.exists():
                    # Check directory structure
                    dir_status = any(metadata_dir.glob("*.parquet"))
                    st.write(f"Directory: `{metadata_dir}`")
                    st.write(f"Parquet Files Found: {'✅' if dir_status else '❌'}")

                    # Check individual files
                    if diagnose_parquet_files(str(metadata_dir)):
                        st.success("✅ Metadata shards valid")
                    else:
                        st.error("❌ Metadata issues detected")
                else:
                    st.error("Metadata directory not found")

            with col2:
                # FAISS index check
                st.subheader("📚 FAISS Validation")
                if faiss_dir.exists():
                    index_files = list(faiss_dir.glob("*.index"))
                    st.write(f"Directory: `{faiss_dir}`")
                    st.write(f"Index Files Found: {len(index_files)}")

                    shard_count = len(search_system.index_shards)
                    if shard_count > 0:
                        st.success(f"✅ {shard_count} FAISS shards loaded")
                        st.write(f"Total Vectors: {sum(s.ntotal for s in search_system.index_shards):,}")
                    else:
                        st.error("❌ No FAISS shards loaded")
                else:
                    st.error("FAISS directory not found")

            # System resource check
            st.subheader("💻 System Resources")
            col_res1, col_res2 = st.columns(2)
            with col_res1:
                st.metric("Memory Usage",
                          _memory_usage_mb(),
                          help="Current process memory usage")
            with col_res2:
                # A non-zero interval is required: without one, the first
                # call to cpu_percent() returns a meaningless 0.0.
                st.metric("CPU Utilization",
                          f"{psutil.cpu_percent(interval=0.1)}%",
                          help="Total system CPU usage")


def _resolve_source_url(source):
    """Map a result's ``source`` field to the URL shown to the user.

    Returns None when *source* is empty or not a string (for example a NaN
    coming out of a DataFrame), so the caller can skip the link entirely.
    """
    if not isinstance(source, str) or not source:
        return None

    src_lower = source.lower()
    # NOTE(review): plain substring match, so any source containing "cord"
    # (e.g. "records") is also routed to the search page - confirm intent.
    if "cord" in src_lower:
        # Link to a CORD-19 search page; adjust the URL as needed
        return "https://www.semanticscholar.org/search"
    if "arxiv" in src_lower:
        # Link to the arXiv search page
        return "https://arxiv.org/search/"
    return source


def _render_result(row):
    """Render one search hit (title, summary, confidence, source link)."""
    similarity = float(row['similarity'])

    with st.expander(f"{row['title']}"):
        # Similarity visualization
        col1, col2 = st.columns([3, 1])
        with col1:
            st.markdown(f"**Summary**: {row['summary']}")
        with col2:
            # Plain text only, so raw HTML rendering is not needed here.
            st.markdown(f"Confidence: {similarity:.1%}")
            # st.progress rejects floats outside [0.0, 1.0].
            st.progress(_clamp_unit(similarity))

        # Safe URL handling
        url = _resolve_source_url(row['source'])
        if url is None:
            return
        if is_valid_url(url):
            # Escape the URL before embedding it in an HTML attribute.
            st.markdown(
                f'<a href="{html.escape(url, quote=True)}" target="_blank" '
                f'rel="noopener noreferrer">🌐 View Source</a>',
                unsafe_allow_html=True
            )
        else:
            st.warning("Invalid source URL")


def _render_no_results():
    """Tell the user nothing matched and suggest how to refine the query."""
    st.warning("No matching documents found")
    st.info("Try these tips:")
    st.markdown(
        "- Use more specific keywords\n"
        "- Check your spelling\n"
        "- Avoid special characters"
    )


def _run_search(search_system, query):
    """Sanitize *query*, run the search and render the outcome.

    An unusable query produces a warning and returns early instead of
    stopping the script, so the rest of the page still renders.
    """
    # Sanitize and validate query
    clean_query = sanitize_query(query)
    if not clean_query:
        st.warning("Please enter a valid search query")
        return

    try:
        with st.spinner("🔍 Searching through documents..."):
            start_time = time.perf_counter()
            results = search_system.search(clean_query, _TOP_K)
            search_duration = time.perf_counter() - start_time

        if results is None or results.empty:
            _render_no_results()
            return

        st.subheader(f"Top Results ({search_duration:.2f}s)")
        # Visualize results with enhanced formatting
        for _, row in results.iterrows():
            _render_result(row)
    except Exception:
        # Top-level UI boundary: log the traceback, show a generic message.
        logger.exception("Search failed")
        st.error("Search operation failed. Please try again.")


def _render_sidebar(search_system):
    """Render system status, diagnostics, health check and cache controls."""
    with st.sidebar:
        st.subheader("📊 System Status")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Total Documents",
                      f"{search_system.metadata_mgr.total_docs:,}",
                      help="Total indexed documents in system")
        with col2:
            st.metric("FAISS Shards",
                      len(search_system.index_shards),
                      help="Number of loaded vector index shards")

        st.metric("Active Memory",
                  _memory_usage_mb(),
                  help="Current memory usage by the application")

        # Diagnostics section
        if diagnostics_available:
            add_diagnostics_ui(search_system)
        else:
            st.warning("Diagnostics module not available")

        # Health check with error handling
        if st.button("🩺 Run Health Check"):
            try:
                system_stats = {
                    "shards_loaded": len(search_system.index_shards),
                    "metadata_records": search_system.metadata_mgr.total_docs,
                    "memory_usage": _memory_usage_mb(),
                    "active_threads": threading.active_count(),
                    "system_load": _system_load()
                }
                st.json(system_stats)
            except Exception as e:
                st.error(f"Health check failed: {str(e)}")

        # Cache management
        if st.button("♻️ Clear Cache"):
            try:
                st.cache_resource.clear()
                st.rerun()
            except Exception as e:
                st.error(f"Cache clearance failed: {str(e)}")


def main():
    """Entry point: configure the page, load the search system, draw the UI."""
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="🔍",
        layout="wide"
    )

    # Initialize search system with enhanced caching. Failures propagate to
    # the single handler below, so nothing is cached for a failed attempt.
    @st.cache_resource(ttl=3600, show_spinner="Initializing search engine...")
    def init_search_system():
        system = SemanticSearch()
        system.initialize_system()
        logger.info("Search system initialized successfully")
        return system

    # Custom CSS with enhanced visual design
    # NOTE(review): the stylesheet body is blank in this file - confirm
    # whether CSS rules are meant to be supplied here.
    st.markdown(""" """, unsafe_allow_html=True)

    try:
        search_system = init_search_system()
    except Exception:
        logger.exception("System initialization failed")
        st.error("Critical system initialization error. Check logs.")
        st.stop()

    # Main UI components
    st.title("🔍 Semantic Search Engine")

    # Search input with sanitization
    query = st.text_input("Enter your search query:",
                          placeholder="Search documents...",
                          max_chars=200)

    if query:
        _run_search(search_system, query)

    # System monitoring sidebar
    _render_sidebar(search_system)


if __name__ == "__main__":
    main()