# semantic-search / app.py
# Hosting-page residue from the original listing: last commit "Update app.py"
# (5240191, verified) by Testys; file size 6.68 kB.
import streamlit as st
from search_utils import SemanticSearch
import logging
import time
import psutil
from urllib.parse import quote
import threading
import re
from pathlib import Path
from urllib.parse import urlparse
# Logging setup: the root logger emits INFO and above through a single
# StreamHandler; the app logs under its own named logger.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("SemanticSearchApp")
# Security validation functions
def is_valid_url(url, allowed_schemes=None):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: The candidate URL. Values that ``urlparse`` cannot handle
            (for example an ``int``) are reported as invalid rather than
            raising.
        allowed_schemes: Optional iterable of lowercase scheme names. When
            given, the parsed scheme must be one of them; when omitted, any
            scheme is accepted.

    Returns:
        bool: True if the URL has a scheme and a netloc (and, if requested,
        an allowed scheme); False otherwise.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # urlparse raises ValueError for malformed netlocs (e.g. an unclosed
        # IPv6 bracket) and AttributeError/TypeError for unsupported input
        # types. Only these are swallowed, so interrupts still propagate.
        return False

    if not (parsed.scheme and parsed.netloc):
        return False
    if allowed_schemes is not None and parsed.scheme not in allowed_schemes:
        return False
    return True
def sanitize_query(query, max_length=256):
    """Reduce a user-supplied query to word characters, whitespace and hyphens.

    Every character outside ``[\\w\\s-]`` is removed, surrounding whitespace is
    stripped, and the result is truncated to *max_length* characters.

    Args:
        query: Raw text typed by the user.
        max_length: Maximum number of characters kept (defaults to 256).

    Returns:
        str: The cleaned query. An empty string means nothing usable was left
        (or *query* was not a string), so callers can reject it with a plain
        truthiness check.
    """
    if not isinstance(query, str):
        # Fail closed: never hand unsanitized input back to the caller.
        logger.error(
            "Query sanitization failed: expected str, got %s",
            type(query).__name__,
        )
        return ""

    cleaned = re.sub(r"[^\w\s-]", "", query).strip()
    # Truncation can expose trailing whitespace again, so trim the right edge.
    return cleaned[:max_length].rstrip()
def add_diagnostics_ui(search_system):
    """Render a collapsed "Diagnostics" expander in the sidebar.

    The panel has three parts: a check of the metadata location and the
    parquet files found there, a check that the index location exists together
    with the vector count, and the current process memory / CPU load.

    Args:
        search_system: Object exposing ``metadata_mgr.metadata_path``,
            ``shard_dir`` and ``total_vectors``. Both paths are assumed to be
            filesystem paths (``str`` or ``Path``) -- TODO confirm against
            ``search_utils``.
    """
    with st.sidebar.expander("🔧 Diagnostics", expanded=False):
        col1, col2 = st.columns(2)

        # Metadata validation
        with col1:
            st.subheader("📂 Metadata Validation")
            metadata_dir = Path(search_system.metadata_mgr.metadata_path)
            if metadata_dir.exists():
                parquet_files = _list_parquet_files(metadata_dir)
                st.write(f"Directory: `{metadata_dir}`")
                st.write(f"Parquet Files: {len(parquet_files)}")
                # Report a missing-files condition as an error, not a success.
                if parquet_files:
                    st.success("✅ Valid metadata")
                else:
                    st.error("❌ No parquet files found")
            else:
                st.error("Metadata directory not found")

        # FAISS validation
        with col2:
            st.subheader("📚 FAISS Validation")
            faiss_path = Path(search_system.shard_dir)
            if faiss_path.exists():
                st.write(f"Index Path: `{faiss_path}`")
                st.success("✅ Index loaded")
                st.write(f"Vectors: {search_system.total_vectors}")
            else:
                st.error("FAISS index not found")

        # System resources
        st.subheader("💻 System Resources")
        col_res1, col_res2 = st.columns(2)
        with col_res1:
            # Resident set size of this process, in whole mebibytes.
            mem_usage = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{mem_usage} MB")
        with col_res2:
            # A short blocking interval gives a real sample; without one the
            # first call has no baseline and reports 0.0.
            cpu_usage = psutil.cpu_percent(interval=0.1)
            status_color = "#ff0000" if cpu_usage > 80 else "#00ff00"
            st.markdown(
                f"<span style='color:{status_color}'>CPU: {cpu_usage}%</span>",
                unsafe_allow_html=True,
            )


def _list_parquet_files(metadata_path):
    """Return the ``*.parquet`` files at *metadata_path* (a directory or one file)."""
    if metadata_path.is_dir():
        # Non-recursive on purpose; switch to rglob if shards turn out to be
        # stored in nested folders -- TODO confirm the on-disk layout.
        return sorted(metadata_path.glob("*.parquet"))
    if metadata_path.suffix == ".parquet":
        return [metadata_path]
    return []
def main():
    """Build the Streamlit page: search box, ranked results and health sidebar.

    The search backend is created once through ``st.cache_resource`` (one-hour
    TTL) and reused across reruns. If it cannot be created the script stops
    after showing an error.
    """
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="🔍",
        layout="wide",
    )
    st.markdown(_PAGE_CSS, unsafe_allow_html=True)

    # Initialize search system
    @st.cache_resource(ttl=3600)
    def init_search_system():
        try:
            system = SemanticSearch()
            system.initialize_system()
        except Exception as exc:
            logger.exception("Initialization failed: %s", exc)
            st.error("System initialization error")
            st.stop()
        logger.info("Search system initialized")
        return system

    try:
        search_system = init_search_system()
    except Exception as exc:
        logger.exception("Critical error: %s", exc)
        st.error(f"Critical error: {str(exc)}")
        st.stop()

    # Main UI components
    st.title("🔍 Research Paper Semantic Search Engine")
    query = st.text_input("Search knowledge base:", placeholder="Enter your query...")
    if query:
        _run_search(search_system, query)

    _render_sidebar(search_system)


# Custom CSS injected once per run by main().
_PAGE_CSS = """
<style>
.metric-box {
padding: 15px;
border-radius: 8px;
background: #f8f9fa;
margin: 10px 0;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.result-card {
padding: 15px;
border-left: 4px solid #1e88e5;
margin: 10px 0;
background: #fff;
}
</style>
"""


def _run_search(search_system, query, top_k=5):
    """Sanitize *query*, run the search and render the outcome or an error."""
    clean_query = sanitize_query(query)
    if not clean_query or not clean_query.strip():
        st.warning("Invalid query format")
        # Return instead of stopping the script so the sidebar still renders.
        return

    with st.spinner("Analyzing documents..."):
        started = time.perf_counter()
        try:
            results = search_system.search(clean_query, top_k)
            search_duration = time.perf_counter() - started
            if results is None or results.empty:
                st.warning("No matches found")
                st.info("Try refining your search terms")
            else:
                st.subheader(f"Top Results ({search_duration:.2f}s)")
                # iterrows() assumes the backend returns a DataFrame.
                for _, row in results.iterrows():
                    _render_result(row)
        except Exception as exc:
            # Best-effort boundary: keep the page alive, but record the
            # traceback so the failure can be diagnosed.
            logger.exception("Search error: %s", exc)
            st.error("Search operation failed")


def _render_result(row):
    """Render one search hit as an expander with summary, score and links."""
    logger.debug("Results: %s", row)
    title = str(row["title"])
    with st.expander(title):
        st.markdown(f"**Summary**: {row['summary']}")
        similarity = _unit_interval(row["similarity"])
        st.progress(similarity)
        st.markdown(f"**Confidence**: {similarity:.1%}")
        st.markdown(f"**Authors**: {row['authors']}")
        st.write("**Sources**:")
        for url in _safe_source_urls(row["source"]):
            st.markdown(f"- [{url}]({url})")
        st.write(
            f"[Google Scholar Search](https://scholar.google.com/scholar?q={quote(title)})"
        )


def _unit_interval(value):
    """Coerce *value* to a builtin float clamped to [0.0, 1.0] (0.0 if unusable)."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    if score != score:  # NaN is the only value that differs from itself
        return 0.0
    return min(max(score, 0.0), 1.0)


def _safe_source_urls(source):
    """Return the well-formed http(s) URLs found in *source* (one string or an iterable)."""
    if source is None:
        return []
    if isinstance(source, str):
        # A bare string must not be iterated character by character.
        candidates = [source]
    else:
        try:
            candidates = list(source)
        except TypeError:
            candidates = [source]

    urls = []
    for candidate in candidates:
        url = str(candidate).strip()
        if is_valid_url(url) and urlparse(url).scheme in ("http", "https"):
            urls.append(url)
    return urls


def _render_sidebar(search_system):
    """Render corpus counters, optional diagnostics, monitoring and the refresh button."""
    # System status sidebar
    with st.sidebar:
        st.subheader("📊 System Health")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")
        with col2:
            vectors = search_system.total_vectors
            st.metric("Vectors", f"{vectors:,}")

        # Diagnostics section
        if st.checkbox("Show advanced diagnostics"):
            add_diagnostics_ui(search_system)

        # System monitoring
        st.subheader("⚙️ Monitoring")
        with st.expander("Performance"):
            mem = psutil.virtual_memory()
            st.write(f"Memory: {mem.percent}% used")
            st.write(f"CPU Cores: {psutil.cpu_count()}")
            st.write(f"Active threads: {threading.active_count()}")

        if st.button("🔄 Refresh System"):
            # Drop the cached search system so the next run rebuilds it.
            st.cache_resource.clear()
            st.rerun()
# Build the page only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()