File size: 6,612 Bytes
6129cb8
778626d
6ecbc91
 
4abeab7
c056209
cb9097f
 
c056209
90c96a2
cb9097f
6ecbc91
 
 
 
4abeab7
6ecbc91
 
 
 
c056209
 
6ecbc91
 
4abeab7
c056209
6ecbc91
 
c056209
 
cb9097f
c056209
cb9097f
 
c056209
 
 
 
6ecbc91
c056209
 
 
 
 
1193a3b
c056209
 
 
 
 
 
 
 
 
 
 
 
2bff6ee
c056209
 
2bff6ee
ab5f789
c056209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4abeab7
7f849e0
 
 
 
 
 
6129cb8
c056209
7f849e0
 
c056209
 
 
 
 
 
6ecbc91
c056209
 
 
 
 
7f849e0
 
 
6129cb8
c056209
 
 
 
778626d
c056209
 
 
 
 
 
 
 
6ecbc91
 
 
c056209
6ecbc91
2b6cdbb
c056209
778626d
c056209
 
7f849e0
c056209
 
 
 
 
 
 
 
 
6ecbc91
 
4c7de63
c056209
 
6ecbc91
c056209
6d4fe2e
fecb48c
73b3f14
 
 
c056209
 
73b3f14
 
c056209
73b3f14
6d4fe2e
c056209
 
6d4fe2e
c056209
 
7f849e0
c056209
 
6ecbc91
 
c056209
 
6ecbc91
766799c
c056209
 
 
 
6ecbc91
 
c056209
 
 
 
 
 
 
6ecbc91
c056209
 
1193a3b
7f849e0
 
90c96a2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import streamlit as st
from search_utils import SemanticSearch
import logging
import time
import psutil
from urllib.parse import quote
import threading
import re
from pathlib import Path
from urllib.parse import urlparse

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("SemanticSearchApp")

# Security validation functions
def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location.

    This is a lightweight format check (not a reachability or allow-list
    check): "https://example.com" passes, "example.com" does not because
    it lacks a scheme.

    Args:
        url: Candidate URL string.

    Returns:
        bool: True when the URL has a non-empty scheme and netloc.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # urlparse raises ValueError on malformed input (e.g. invalid
        # ports); non-string input raises AttributeError/TypeError.
        # The previous bare `except:` also swallowed KeyboardInterrupt
        # and SystemExit, which must propagate.
        return False
    return bool(parsed.scheme and parsed.netloc)

def sanitize_query(query):
    """Strip disallowed characters from a user query and cap its length.

    Word characters, whitespace and hyphens are kept; every other
    character is removed, and the result is truncated to 256 characters.
    If sanitization itself raises, the error is logged and the raw query
    (truncated) is returned as a best-effort fallback.
    """
    try:
        cleaned = re.sub(r'[^\w\s-]', '', query)
    except Exception as exc:
        logger.error(f"Query sanitization failed: {str(exc)}")
        return query[:256]
    return cleaned[:256]

def add_diagnostics_ui(search_system):
    """Render the diagnostics expander in the sidebar.

    Shows metadata-shard and FAISS-index validation side by side, then
    current process memory usage and host CPU load.
    """
    with st.sidebar.expander("πŸ”§ Diagnostics", expanded=False):
        meta_col, faiss_col = st.columns(2)

        # Metadata shard directory: must exist and hold >= 1 parquet file.
        with meta_col:
            st.subheader("πŸ“‚ Metadata Validation")
            metadata_dir = search_system.metadata_mgr.shard_dir
            if not metadata_dir.exists():
                st.error("Metadata directory not found")
            else:
                parquet_files = list(metadata_dir.glob("*.parquet"))
                st.write(f"Directory: `{metadata_dir}`")
                st.write(f"Parquet Files: {len(parquet_files)}")
                st.success("βœ… Valid metadata" if parquet_files else "❌ No parquet files found")

        # FAISS index path: report vector count when present.
        with faiss_col:
            st.subheader("πŸ“š FAISS Validation")
            faiss_path = search_system.shard_dir
            if not faiss_path.exists():
                st.error("FAISS index not found")
            else:
                st.write(f"Index Path: `{faiss_path}`")
                st.success(f"βœ… Index loaded")
                st.write(f"Vectors: {search_system.total_vectors}")

        # Host resource snapshot.
        st.subheader("πŸ’» System Resources")
        res_left, res_right = st.columns(2)
        with res_left:
            rss_mb = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{rss_mb} MB")

        with res_right:
            cpu = psutil.cpu_percent()
            # Red above 80% load, green otherwise.
            color = "#00ff00" if cpu <= 80 else "#ff0000"
            st.markdown(f"<span style='color:{color}'>CPU: {cpu}%</span>",
                        unsafe_allow_html=True)

def main():
    """App entry point: configure the page, run the search UI and sidebar.

    Flow: page config + CSS -> cached search-system init -> query input ->
    results rendering -> system-health sidebar. Stops the script early on
    initialization failure or an empty sanitized query.
    """
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="πŸ”",
        layout="wide"
    )

    # Custom CSS styling
    st.markdown("""
    <style>
    .metric-box {
        padding: 15px;
        border-radius: 8px;
        background: #f8f9fa;
        margin: 10px 0;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .result-card {
        padding: 15px;
        border-left: 4px solid #1e88e5;
        margin: 10px 0;
        background: #fff;
    }
    </style>
    """, unsafe_allow_html=True)

    # Initialize search system; cached for an hour so Streamlit reruns
    # don't rebuild the index on every interaction.
    @st.cache_resource(ttl=3600)
    def init_search_system():
        try:
            system = SemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized")
            return system
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            st.error("System initialization error")
            st.stop()

    try:
        search_system = init_search_system()
    except Exception as e:
        st.error(f"Critical error: {str(e)}")
        st.stop()

    # Main UI components
    st.title("πŸ” Research Paper Semantic Search Engine")
    query = st.text_input("Search knowledge base:", placeholder="Enter your query...")

    if query:
        clean_query = sanitize_query(query)
        if not clean_query:
            st.warning("Invalid query format")
            st.stop()

        with st.spinner("Analyzing documents..."):
            start_time = time.time()
            try:
                results = search_system.search(clean_query, 5)
                search_duration = time.time() - start_time

                if results.empty:
                    st.warning("No matches found")
                    st.info("Try refining your search terms")
                else:
                    st.subheader(f"Top Results ({search_duration:.2f}s)")
                    # results is DataFrame-like; iterate row-wise.
                    for index, res in results.iterrows():
                        logger.info(f"Results: {res}")
                        with st.expander(res["title"]):
                            st.markdown(f"**Summary**: {res['summary']}")
                            similarity = res['similarity']
                            st.progress(similarity)
                            st.markdown(f"**Confidence**: {similarity:.1%}")
                            st.markdown(f"**Authors**: {res['authors']}")
                            source = res['source']
                            # Only render links for well-formed URLs.
                            if source and is_valid_url(source):
                                st.markdown(f"[View Source]({source}) | [Google Scholar Search](https://scholar.google.com/scholar?q={quote(res['title'])})")

            except Exception as e:
                logger.error(f"Search error: {str(e)}")
                # BUG FIX: this banner was previously dedented outside the
                # except block, so it rendered after every search — even
                # successful ones. It now fires only on actual failures.
                st.error("Search operation failed")

    # System status sidebar
    with st.sidebar:
        st.subheader("πŸ“Š System Health")

        col1, col2 = st.columns(2)
        with col1:
            st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")

        with col2:
            vectors = search_system.total_vectors
            st.metric("Vectors", f"{vectors:,}")

        # Diagnostics section
        if st.checkbox("Show advanced diagnostics"):
            add_diagnostics_ui(search_system)

        # System monitoring
        st.subheader("βš™οΈ Monitoring")
        with st.expander("Performance"):
            mem = psutil.virtual_memory()
            st.write(f"Memory: {mem.percent}% used")
            st.write(f"CPU Cores: {psutil.cpu_count()}")
            st.write(f"Active threads: {threading.active_count()}")

        # Clear the cached search system and rerun the script from the top.
        if st.button("πŸ”„ Refresh System"):
            st.cache_resource.clear()
            st.rerun()

if __name__ == "__main__":
    main()