"""Streamlit front-end for a semantic search engine.

Module-level side effects (preserved from the original):
  * schedules a keep-alive ping of the hosted Space every 10 minutes,
  * starts a daemon thread that drives the scheduler,
  * configures root logging.
"""

import logging
import re
import threading
import time
from pathlib import Path
from urllib.parse import quote, urlparse

import psutil
import requests
import schedule
import streamlit as st

from search_utils import SemanticSearch


def ping_server():
    """Ping the hosted Space so it does not go to sleep (best-effort)."""
    try:
        print("Pinging server")
        # Timeout added: without it a hung request would block the
        # scheduler daemon thread forever.
        requests.get("https://testys-semantic-search.hf.space", timeout=30)
    except requests.exceptions.RequestException:
        print("Server is down")


schedule.every(10).minutes.do(ping_server)


def run_schedule():
    """Run pending scheduled jobs forever; executed in a daemon thread."""
    while True:
        schedule.run_pending()
        time.sleep(1)


thread = threading.Thread(target=run_schedule)
thread.daemon = True  # do not keep the process alive for the pinger
thread.start()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("SemanticSearchApp")


# Security validation functions
def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a netloc."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:  # urlparse raises ValueError on malformed input; was a bare except
        return False


def sanitize_query(query):
    """Sanitize user input to prevent injection attacks.

    Keeps only word characters, whitespace and hyphens, truncated to
    256 characters. Falls back to a plain truncation if the regex pass
    fails for any reason.
    """
    try:
        return re.sub(r'[^\w\s-]', '', query)[:256]
    except Exception as e:
        logger.error(f"Query sanitization failed: {str(e)}")
        return query[:256]


def add_diagnostics_ui(search_system):
    """Enhanced diagnostics with accurate path handling.

    Renders metadata/FAISS health checks plus process resource metrics
    into a sidebar expander.
    """
    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        col1, col2 = st.columns(2)

        # Metadata validation
        with col1:
            st.subheader("📂 Metadata Validation")
            metadata_dir = search_system.metadata_mgr.metadata_path
            if metadata_dir.exists():
                # BUG FIX: the original set parquet_files = [metadata_dir],
                # i.e. a one-element list containing the directory itself,
                # so the count was always 1 and the check never failed.
                # Enumerate the actual parquet files instead.
                # (Assumes metadata_path is a directory of .parquet files,
                # consistent with the "Directory:" label below.)
                parquet_files = list(metadata_dir.glob("*.parquet"))
                status = len(parquet_files) > 0
                st.write(f"Directory: `{metadata_dir}`")
                st.write(f"Parquet Files: {len(parquet_files)}")
                st.success("✅ Valid metadata" if status else "❌ No parquet files found")
            else:
                st.error("Metadata directory not found")

        # FAISS validation
        with col2:
            st.subheader("📚 FAISS Validation")
            faiss_path = search_system.shard_dir
            if faiss_path.exists():
                st.write(f"Index Path: `{faiss_path}`")
                st.success(f"✅ Index loaded")
                st.write(f"Vectors: {search_system.total_vectors}")
            else:
                st.error("FAISS index not found")

        # System resources
        st.subheader("💻 System Resources")
        col_res1, col_res2 = st.columns(2)
        with col_res1:
            mem_usage = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{mem_usage} MB")
        with col_res2:
            # Removed unused status_color local: it was computed but never
            # interpolated into the markdown below.
            cpu_usage = psutil.cpu_percent()
            st.markdown(f"CPU: {cpu_usage}%", unsafe_allow_html=True)


def main():
    """Entry point: page config, search UI, and system-health sidebar."""
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="🔍",
        layout="wide"
    )

    # Custom CSS styling (placeholder — currently empty)
    st.markdown(""" """, unsafe_allow_html=True)

    # Initialize search system; cached for one hour across Streamlit reruns.
    @st.cache_resource(ttl=3600)
    def init_search_system():
        try:
            system = SemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized")
            return system
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            st.error("System initialization error")
            st.stop()

    try:
        search_system = init_search_system()
    except Exception as e:
        st.error(f"Critical error: {str(e)}")
        st.stop()

    # Main UI components
    st.title("🔍 Academics Research Semantics Search Engine")
    query = st.text_input("Search knowledge base:", placeholder="Enter your query...")

    if query:
        clean_query = sanitize_query(query)
        if not clean_query:
            st.warning("Invalid query format")
            st.stop()

        with st.spinner("Analyzing documents..."):
            start_time = time.time()
            try:
                results = search_system.search(clean_query, 5)
                search_duration = time.time() - start_time

                if results.empty:
                    st.warning("No matches found")
                    st.info("Try refining your search terms")
                else:
                    st.subheader(f"Top Results ({search_duration:.2f}s)")
                    # results is a DataFrame; iterate its rows.
                    for index, res in results.iterrows():
                        logger.info(f"Results: {res}")
                        with st.expander(res["title"]):
                            st.markdown(f"**Summary**: {res['summary']}")
                            similarity = res['similarity']
                            st.progress(similarity)
                            st.markdown(f"**Confidence**: {similarity:.1%}")
                            st.markdown(f"**Authors**: {res['authors']}")
                            # Removed unused `source` local; the row's
                            # 'source' field is iterated directly.
                            st.write("**Sources**:")
                            for url in res['source']:
                                st.markdown(f"- [{url}]({url})")
                            st.write(f"[Google Scholar Search](https://scholar.google.com/scholar?q={quote(res['title'])})")
            except Exception as e:
                logger.error(f"Search error: {str(e)}")
                st.error("Search operation failed")

    # System status sidebar
    with st.sidebar:
        st.subheader("📊 System Health")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")
        with col2:
            vectors = search_system.total_vectors
            st.metric("Vectors", f"{vectors:,}")

        # Diagnostics section
        if st.checkbox("Show advanced diagnostics"):
            add_diagnostics_ui(search_system)

        # System monitoring
        st.subheader("⚙️ Monitoring")
        with st.expander("Performance"):
            mem = psutil.virtual_memory()
            st.write(f"Memory: {mem.percent}% used")
            st.write(f"CPU Cores: {psutil.cpu_count()}")
            st.write(f"Active threads: {threading.active_count()}")

        if st.button("🔄 Refresh System"):
            st.cache_resource.clear()
            st.rerun()

    # Footer (placeholder — currently empty)
    st.markdown(""" """, unsafe_allow_html=True)


if __name__ == "__main__":
    main()