Spaces:
Running
Running
import streamlit as st | |
import schedule | |
import time | |
import requests | |
import threading | |
from search_utils import SemanticSearch | |
import logging | |
import time | |
import psutil | |
from urllib.parse import quote | |
import threading | |
import re | |
from pathlib import Path | |
from urllib.parse import urlparse | |
def ping_server(url="https://testys-semantic-search.hf.space", timeout=10):
    """Send a keep-alive GET request and report on stdout if it fails.

    Args:
        url: Address to request. Defaults to the hosted Space URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would block the caller forever.

    Returns:
        None. Connection failures, timeouts and HTTP error statuses are
        reported with a printed message instead of being raised.
    """
    print("Pinging server")
    try:
        response = requests.get(url, timeout=timeout)
        # A 4xx/5xx answer also means the app is not serving; raise_for_status
        # turns it into an HTTPError, which is a RequestException subclass.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print("Server is down")
def run_schedule():
    """Run due ``schedule`` jobs forever, polling once per second."""
    while True:
        schedule.run_pending()
        time.sleep(1)


# Streamlit re-executes this script on every rerun, while the `schedule`
# module's job list and any started threads live for the whole process.
# Registering the job and starting a thread unconditionally would therefore
# add one more ping job and one more polling thread per rerun. The name below
# identifies both the job (as a tag) and the thread so they are created once.
_KEEPALIVE_NAME = "keepalive-scheduler"

# Reuse the already-running scheduler thread if an earlier run started it.
thread = next(
    (t for t in threading.enumerate() if t.name == _KEEPALIVE_NAME),
    None,
)
if thread is None:
    # Drop any job left behind by a scheduler thread that has since died,
    # then register exactly one keep-alive ping every 10 minutes.
    schedule.clear(_KEEPALIVE_NAME)
    schedule.every(10).minutes.do(ping_server).tag(_KEEPALIVE_NAME)
    # Daemon thread: it must not keep the interpreter alive on shutdown.
    thread = threading.Thread(
        target=run_schedule,
        name=_KEEPALIVE_NAME,
        daemon=True,
    )
    thread.start()
# Logging setup: records at INFO and above are sent to a stream handler,
# formatted as "timestamp - logger name - level - message".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)

# Application-wide logger used by the functions in this file.
logger = logging.getLogger("SemanticSearchApp")
# Security validation functions
def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    This is a format check only: any scheme is accepted as long as a host
    part is present (so ``example.com`` and ``javascript:alert(1)`` are
    rejected, while ``ftp://host`` passes).

    Args:
        url: Candidate URL. Anything that is not ``str``/``bytes``/
            ``bytearray`` is rejected outright.

    Returns:
        bool: True if the URL has a non-empty scheme and netloc, else False.
    """
    if not isinstance(url, (str, bytes, bytearray)):
        return False
    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed input such as an
        # unterminated IPv6 literal or undecodable bytes. Only that is
        # caught, so KeyboardInterrupt/SystemExit are no longer swallowed.
        return False
    return bool(parts.scheme and parts.netloc)
def sanitize_query(query):
    """Strip unsafe characters from a user query and cap its length.

    Keeps only word characters, whitespace and hyphens, then truncates the
    result to 256 characters.

    Args:
        query: Raw user input, expected to be a ``str``.

    Returns:
        str: The sanitized query. An empty string is returned when *query*
        is not a string, so callers that treat an empty result as "invalid
        query" never receive unsanitized data back.
    """
    if not isinstance(query, str):
        # Previously the failure path returned ``query[:256]`` untouched,
        # handing back raw input (or raising again for values such as None).
        # The logger is looked up by name so this helper does not depend on
        # module-level setup order; it is the same "SemanticSearchApp" logger.
        logging.getLogger("SemanticSearchApp").error(
            f"Query sanitization failed: expected str, got {type(query).__name__}"
        )
        return ""
    return re.sub(r'[^\w\s-]', '', query)[:256]
def add_diagnostics_ui(search_system):
    """Render the collapsible diagnostics panel in the sidebar.

    The panel shows two side-by-side checks -- whether
    ``search_system.metadata_mgr.metadata_path`` exists and whether
    ``search_system.shard_dir`` exists -- followed by this process's memory
    usage and the CPU percentage reported by psutil.

    Args:
        search_system: The search backend whose paths and vector count are
            displayed.
    """
    panel = st.sidebar.expander("π§ Diagnostics", expanded=False)
    with panel:
        metadata_col, faiss_col = st.columns(2)

        # --- Metadata check -------------------------------------------
        with metadata_col:
            st.subheader("π Metadata Validation")
            metadata_path = search_system.metadata_mgr.metadata_path
            if not metadata_path.exists():
                st.error("Metadata directory not found")
            else:
                # NOTE(review): the list always holds exactly the path
                # itself, so the count is always 1 and the "no parquet
                # files" message below can never be shown -- confirm intent.
                found = [metadata_path]
                file_count = len(found)
                st.write(f"Directory: `{metadata_path}`")
                st.write(f"Parquet Files: {file_count}")
                if file_count > 0:
                    st.success("β Valid metadata")
                else:
                    st.success("β No parquet files found")

        # --- FAISS check ----------------------------------------------
        with faiss_col:
            st.subheader("π FAISS Validation")
            index_path = search_system.shard_dir
            if not index_path.exists():
                st.error("FAISS index not found")
            else:
                st.write(f"Index Path: `{index_path}`")
                st.success("β Index loaded")
                st.write(f"Vectors: {search_system.total_vectors}")

        # --- Process / host resources ---------------------------------
        st.subheader("π» System Resources")
        memory_col, cpu_col = st.columns(2)

        with memory_col:
            # Resident set size of this process, in whole mebibytes.
            rss_bytes = psutil.Process().memory_info().rss
            rss_mb = rss_bytes // (1024 * 1024)
            st.metric("Memory Usage", f"{rss_mb} MB")

        with cpu_col:
            cpu_pct = psutil.cpu_percent()
            # Red above 80 %, green otherwise.
            if cpu_pct > 80:
                colour = "#ff0000"
            else:
                colour = "#00ff00"
            st.markdown(
                f"<span style='color:{colour}'>CPU: {cpu_pct}%</span>",
                unsafe_allow_html=True,
            )
def _similarity_fraction(value):
    """Coerce a similarity score to a plain float clamped to [0.0, 1.0]."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    if score != score:  # NaN is the only value unequal to itself
        return 0.0
    return min(1.0, max(0.0, score))


def _source_list(source):
    """Normalise a result row's ``source`` field to a list of strings."""
    if source is None:
        return []
    if isinstance(source, str):
        # A bare string must not be iterated character by character.
        return [source] if source.strip() else []
    try:
        return [str(item) for item in source]
    except TypeError:
        # Non-iterable scalar (for example a float NaN for a missing value).
        return []


def main():
    """Build the Streamlit page: search box, result cards, sidebar, footer.

    Flow:
        1. Configure the page and inject the custom CSS.
        2. Create and initialise the ``SemanticSearch`` backend; on failure
           show an error and stop the script run.
        3. If a query was entered, sanitize it, run the search for the top
           5 hits and render one expander per result.
        4. Render the sidebar (document/vector counts, optional diagnostics,
           host monitoring, refresh button) and the footer.
    """
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="π",
        layout="wide"
    )

    # Custom CSS styling
    st.markdown("""
    <style>
    .metric-box {
        padding: 15px;
        border-radius: 8px;
        background: #f8f9fa;
        margin: 10px 0;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .result-card {
        padding: 15px;
        border-left: 4px solid #1e88e5;
        margin: 10px 0;
        background: #fff;
    }
    </style>
    """, unsafe_allow_html=True)

    # Initialize search system
    # NOTE(review): this function is not wrapped in st.cache_resource, so the
    # backend is rebuilt on every script run and the "Refresh System" button
    # below only clears resources cached elsewhere -- confirm whether caching
    # was intended here.
    def init_search_system():
        try:
            system = SemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized")
            return system
        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception(f"Initialization failed: {str(e)}")
            st.error("System initialization error")
            st.stop()

    try:
        search_system = init_search_system()
    except Exception as e:
        st.error(f"Critical error: {str(e)}")
        st.stop()

    # Main UI components
    st.title("π Academics Research Semantics Search Engine")
    query = st.text_input("Search knowledge base:", placeholder="Enter your query...")

    if query:
        clean_query = sanitize_query(query)
        # Reject queries that are empty or whitespace-only after sanitizing.
        if not clean_query or not clean_query.strip():
            st.warning("Invalid query format")
            st.stop()

        with st.spinner("Analyzing documents..."):
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by wall-clock adjustments.
            start_time = time.perf_counter()
            try:
                results = search_system.search(clean_query, 5)
                search_duration = time.perf_counter() - start_time

                if results.empty:
                    st.warning("No matches found")
                    st.info("Try refining your search terms")
                else:
                    st.subheader(f"Top Results ({search_duration:.2f}s)")
                    for _, res in results.iterrows():
                        logger.info(f"Results: {res}")
                        title = str(res["title"])
                        with st.expander(title):
                            st.markdown(f"**Summary**: {res['summary']}")
                            # st.progress only accepts a float in [0.0, 1.0];
                            # scores may arrive as NumPy scalars or fall
                            # slightly outside that range.
                            similarity = _similarity_fraction(res['similarity'])
                            st.progress(similarity)
                            st.markdown(f"**Confidence**: {similarity:.1%}")
                            st.markdown(f"**Authors**: {res['authors']}")
                            st.write("**Sources**:")
                            for url in _source_list(res['source']):
                                if is_valid_url(url):
                                    st.markdown(f"- [{url}]({url})")
                                else:
                                    # Not a well-formed URL: show it as text
                                    # rather than as a clickable link.
                                    st.markdown(f"- {url}")
                            st.write(f"[Google Scholar Search](https://scholar.google.com/scholar?q={quote(title)})")
            except Exception as e:
                logger.exception(f"Search error: {str(e)}")
                st.error("Search operation failed")

    # System status sidebar
    with st.sidebar:
        st.subheader("π System Health")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")
        with col2:
            vectors = search_system.total_vectors
            st.metric("Vectors", f"{vectors:,}")

        # Diagnostics section
        if st.checkbox("Show advanced diagnostics"):
            add_diagnostics_ui(search_system)

        # System monitoring
        st.subheader("βοΈ Monitoring")
        with st.expander("Performance"):
            mem = psutil.virtual_memory()
            st.write(f"Memory: {mem.percent}% used")
            st.write(f"CPU Cores: {psutil.cpu_count()}")
            st.write(f"Active threads: {threading.active_count()}")

        if st.button("π Refresh System"):
            st.cache_resource.clear()
            st.rerun()

    # Footer
    st.markdown("""
    <footer style="text-align: center; padding: 1rem; margin-top: 2rem; border-top: 1px solid #ddd;">
    <small>Β© 2025 Joel Adesanya's MSc Project.</small>
    </footer>
    """, unsafe_allow_html=True)
# Build the page only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()