import streamlit as st
from search_utils import OptimizedSemanticSearch
import logging
import time
import psutil
from urllib.parse import quote, urlparse
import threading
import re
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("SemanticSearchApp")


# Security validation functions
def is_valid_url(url):
    """Validate URL format and safety."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


def sanitize_query(query):
    """Sanitize user input to prevent injection attacks."""
    try:
        # Strip everything except word characters, whitespace, and hyphens; cap length
        return re.sub(r'[^\w\s-]', '', query)[:256]
    except Exception as e:
        logger.error(f"Query sanitization failed: {str(e)}")
        return query[:256]


def add_diagnostics_ui(search_system):
    """Enhanced diagnostics with accurate path handling."""
    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        col1, col2 = st.columns(2)

        # Metadata validation
        with col1:
            st.subheader("📂 Metadata Validation")
            metadata_dir = search_system.metadata_mgr.metadata_dir
            if metadata_dir.exists():
                parquet_files = list(metadata_dir.glob("*.parquet"))
                status = len(parquet_files) > 0
                st.write(f"Directory: `{metadata_dir}`")
                st.write(f"Parquet Files: {len(parquet_files)}")
                st.success("✅ Valid metadata" if status else "❌ No parquet files found")
            else:
                st.error("Metadata directory not found")

        # FAISS validation
        with col2:
            st.subheader("📚 FAISS Validation")
            faiss_path = Path("combined_index.faiss")
            if faiss_path.exists():
                st.write(f"Index Path: `{faiss_path}`")
                if hasattr(search_system, 'index') and search_system.index:
                    st.success("✅ Index loaded")
                    st.write(f"Vectors: {search_system.index.ntotal:,}")
                else:
                    st.error("❌ Index not loaded")
            else:
                st.error("FAISS index not found")

        # System resources
        st.subheader("💻 System Resources")
        col_res1, col_res2 = st.columns(2)
        with col_res1:
            mem_usage = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{mem_usage} MB")
        with col_res2:
            cpu_usage = psutil.cpu_percent()
            # Highlight the CPU reading in red when load is high
            status_color = "#ff0000" if cpu_usage > 80 else "#00ff00"
            st.markdown(
                f"CPU: <span style='color:{status_color}'>{cpu_usage}%</span>",
                unsafe_allow_html=True
            )


def main():
    """Streamlit entry point: search UI plus system health sidebar."""
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="🔍",
        layout="wide"
    )

    # Custom CSS styling
    st.markdown("""
    """, unsafe_allow_html=True)

    # Initialize search system (cached across reruns for one hour)
    @st.cache_resource(ttl=3600)
    def init_search_system():
        try:
            system = OptimizedSemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized")
            return system
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            st.error("System initialization error")
            st.stop()

    try:
        search_system = init_search_system()
    except Exception as e:
        st.error(f"Critical error: {str(e)}")
        st.stop()

    # Main UI components
    st.title("🔍 Semantic Search Engine")
    query = st.text_input("Search knowledge base:", placeholder="Enter your query...")

    if query:
        clean_query = sanitize_query(query)
        if not clean_query:
            st.warning("Invalid query format")
            st.stop()

        with st.spinner("Analyzing documents..."):
            start_time = time.time()
            try:
                results = search_system.search(clean_query, 5)
                search_duration = time.time() - start_time

                if not results:
                    st.warning("No matches found")
                    st.info("Try refining your search terms")
                else:
                    st.subheader(f"Top Results ({search_duration:.2f}s)")
                    for res in results:
                        with st.expander(res.get('title', 'Untitled')):
                            st.markdown(f"**Summary**: {res.get('summary', '')}")
                            similarity = res.get('similarity', 0)
                            st.progress(similarity)
                            st.markdown(f"**Confidence**: {similarity:.1%}")

                            # Link to the original document if it has a valid URL,
                            # otherwise fall back to a Google Scholar title search
                            source = res.get('source', '')
                            if source and is_valid_url(source):
                                st.markdown(f"[View Source]({source})")
                            elif res.get('title'):
                                st.markdown(
                                    f"[Google Scholar Search]"
                                    f"(https://scholar.google.com/scholar?q={quote(res['title'])})"
                                )
            except Exception as e:
                logger.error(f"Search error: {str(e)}")
                st.error("Search operation failed")

    # System status sidebar
    with st.sidebar:
        st.subheader("📊 System Health")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")
        with col2:
            vectors = search_system.index.ntotal if hasattr(search_system, 'index') else 0
            st.metric("Vectors", f"{vectors:,}")

        # Diagnostics section
        if st.checkbox("Show advanced diagnostics"):
            add_diagnostics_ui(search_system)

        # System monitoring
        st.subheader("⚙️ Monitoring")
        with st.expander("Performance"):
            mem = psutil.virtual_memory()
            st.write(f"Memory: {mem.percent}% used")
            st.write(f"CPU Cores: {psutil.cpu_count()}")
            st.write(f"Active threads: {threading.active_count()}")

        if st.button("🔄 Refresh System"):
            st.cache_resource.clear()
            st.rerun()


if __name__ == "__main__":
    main()