File size: 7,462 Bytes
6129cb8
e8bab50
 
 
 
778626d
6ecbc91
 
4abeab7
c056209
cb9097f
 
c056209
90c96a2
cb9097f
e8bab50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ecbc91
 
 
 
4abeab7
6ecbc91
 
 
 
c056209
 
6ecbc91
 
4abeab7
c056209
6ecbc91
 
c056209
 
cb9097f
c056209
cb9097f
 
c056209
 
 
 
6ecbc91
c056209
 
 
 
 
371095a
c056209
0dc279f
c056209
 
 
 
 
 
 
 
 
 
2bff6ee
c056209
 
2bff6ee
ab5f789
c056209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4abeab7
7f849e0
 
 
 
 
 
6129cb8
c056209
7f849e0
 
c056209
 
 
 
 
 
6ecbc91
c056209
 
 
 
 
7f849e0
 
 
6129cb8
c056209
 
 
 
778626d
c056209
 
 
 
 
 
 
 
6ecbc91
 
 
c056209
6ecbc91
2b6cdbb
c056209
0837466
c056209
 
7f849e0
c056209
 
 
 
 
 
 
 
 
6ecbc91
 
4c7de63
c056209
 
6ecbc91
c056209
6d4fe2e
fecb48c
73b3f14
 
 
c056209
 
73b3f14
 
0d6aa1b
a911b4c
5240191
 
 
6d4fe2e
c056209
 
f848328
c056209
 
7f849e0
c056209
 
6ecbc91
 
c056209
 
6ecbc91
766799c
c056209
 
 
 
6ecbc91
 
c056209
 
 
 
 
 
 
6ecbc91
c056209
 
1193a3b
7f849e0
e8bab50
 
c8c37b3
 
 
 
 
e8bab50
c8c37b3
7f849e0
90c96a2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import streamlit as st
import schedule
import time
import requests
import threading
from search_utils import SemanticSearch
import logging
import time
import psutil
from urllib.parse import quote
import threading
import re
from pathlib import Path
from urllib.parse import urlparse


def ping_server():
    """Ping the hosted Space to keep it from idling out.

    Invoked every 10 minutes by the background scheduler thread.
    Failures are logged to stdout and swallowed so the scheduler
    keeps running.
    """
    try:
        print("Pinging server")
        # Bug fix: requests.get() has no default timeout, so a stalled
        # connection would block the scheduler thread indefinitely and
        # stop all future pings. Cap the wait at 30 seconds.
        requests.get("https://testys-semantic-search.hf.space", timeout=30)
    except requests.exceptions.RequestException:
        print("Server is down")
    
# Keep-alive job: re-ping the hosted endpoint every 10 minutes so the
# Space is not suspended for inactivity.
schedule.every(10).minutes.do(ping_server)


def run_schedule():
    """Run pending scheduled jobs forever, polling once per second."""
    while True:
        schedule.run_pending()
        time.sleep(1)


# Daemon thread: exits with the main process instead of blocking shutdown.
thread = threading.Thread(target=run_schedule)
thread.daemon = True
thread.start()


# Configure logging: INFO-level messages, timestamped, sent to the
# stream handler (stderr) so they show up in the hosting platform's logs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
# Module-level logger used throughout this app.
logger = logging.getLogger("SemanticSearchApp")

# Security validation functions
def is_valid_url(url):
    """Validate URL format and safety.

    Returns True only when the value parses as an absolute URL with
    both a scheme (e.g. 'https') and a network location; False for
    anything else, including non-string input.
    """
    try:
        result = urlparse(url)
        # Both scheme and host must be present for a usable absolute URL.
        return all([result.scheme, result.netloc])
    except (TypeError, ValueError, AttributeError):
        # Narrowed from a bare `except:` so unrelated errors such as
        # KeyboardInterrupt are no longer silently swallowed; these are
        # the exceptions urlparse raises on malformed/non-string input.
        return False

def sanitize_query(query):
    """Sanitize user input to prevent injection attacks.

    Keeps only word characters, whitespace and hyphens, then caps the
    result at 256 characters. If sanitization itself fails, logs the
    error and falls back to a plain length cap of the raw query.
    """
    try:
        cleaned = re.sub(r'[^\w\s-]', '', query)
    except Exception as e:
        logger.error(f"Query sanitization failed: {str(e)}")
        cleaned = query
    return cleaned[:256]

def add_diagnostics_ui(search_system):
    """Render a collapsible sidebar diagnostics panel.

    Shows metadata validation (parquet file count), FAISS index
    validation (path + vector count), and current process resource
    usage (memory RSS, CPU percent).
    """
    with st.sidebar.expander("πŸ”§ Diagnostics", expanded=False):
        col1, col2 = st.columns(2)
        
        # Metadata validation
        with col1:
            st.subheader("πŸ“‚ Metadata Validation")
            metadata_dir = search_system.metadata_mgr.metadata_path
            if metadata_dir.exists():
                # Bug fix: previously the path itself was wrapped in a list,
                # so the file count was always 1 and the status always valid.
                # Enumerate the actual parquet files when it's a directory;
                # keep the original single-entry behavior for a file path.
                if metadata_dir.is_dir():
                    parquet_files = sorted(metadata_dir.glob("*.parquet"))
                else:
                    parquet_files = [metadata_dir]
                status = len(parquet_files) > 0
                st.write(f"Directory: `{metadata_dir}`")
                st.write(f"Parquet Files: {len(parquet_files)}")
                st.success("βœ… Valid metadata" if status else "❌ No parquet files found")
            else:
                st.error("Metadata directory not found")

        # FAISS validation
        with col2:
            st.subheader("πŸ“š FAISS Validation")
            faiss_path = search_system.shard_dir
            if faiss_path.exists():
                st.write(f"Index Path: `{faiss_path}`")
                st.success(f"βœ… Index loaded")
                st.write(f"Vectors: {search_system.total_vectors}")
            else:
                st.error("FAISS index not found")

        # System resources
        st.subheader("πŸ’» System Resources")
        col_res1, col_res2 = st.columns(2)
        with col_res1:
            # RSS in MiB for this process.
            mem_usage = psutil.Process().memory_info().rss // 1024 ** 2
            st.metric("Memory Usage", f"{mem_usage} MB")

        with col_res2:
            cpu_usage = psutil.cpu_percent()
            # Red above 80% CPU, green otherwise.
            status_color = "#ff0000" if cpu_usage > 80 else "#00ff00"
            st.markdown(f"<span style='color:{status_color}'>CPU: {cpu_usage}%</span>", 
                       unsafe_allow_html=True)

def main():
    """Streamlit entry point: render the semantic search UI.

    Configures the page, lazily initializes the (cached) search system,
    sanitizes and runs user queries, renders ranked results, and shows
    system-health widgets in the sidebar.
    """
    st.set_page_config(
        page_title="Semantic Search Engine",
        page_icon="πŸ”",
        layout="wide"
    )

    # Custom CSS styling
    st.markdown("""
    <style>
    .metric-box {
        padding: 15px;
        border-radius: 8px;
        background: #f8f9fa;
        margin: 10px 0;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .result-card {
        padding: 15px;
        border-left: 4px solid #1e88e5;
        margin: 10px 0;
        background: #fff;
    }
    </style>
    """, unsafe_allow_html=True)

    # Initialize search system once per hour; cache_resource shares the
    # instance across sessions and reruns, so the heavy index load
    # happens at most once per TTL window.
    @st.cache_resource(ttl=3600)
    def init_search_system():
        try:
            system = SemanticSearch()
            system.initialize_system()
            logger.info("Search system initialized")
            return system
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            st.error("System initialization error")
            st.stop()

    try:
        search_system = init_search_system()
    except Exception as e:
        st.error(f"Critical error: {str(e)}")
        st.stop()

    # Main UI components
    st.title("πŸ” Academics Research Semantics Search Engine")
    query = st.text_input("Search knowledge base:", placeholder="Enter your query...")

    if query:
        clean_query = sanitize_query(query)
        if not clean_query:
            st.warning("Invalid query format")
            st.stop()

        with st.spinner("Analyzing documents..."):
            start_time = time.time()
            try:
                # Top-5 results; presumably a DataFrame (has .empty /
                # .iterrows) — see search_utils.SemanticSearch.
                results = search_system.search(clean_query, 5)
                search_duration = time.time() - start_time

                if results.empty:
                    st.warning("No matches found")
                    st.info("Try refining your search terms")
                else:
                    st.subheader(f"Top Results ({search_duration:.2f}s)")
                    # Index is unused; iterate rows only.
                    for _, res in results.iterrows():
                        logger.info(f"Results: {res}")
                        with st.expander(res["title"]):
                            st.markdown(f"**Summary**: {res['summary']}")
                            similarity = res['similarity']
                            st.progress(similarity)
                            st.markdown(f"**Confidence**: {similarity:.1%}")
                            st.markdown(f"**Authors**: {res['authors']}")
                            # Fix: reuse the fetched value instead of a second
                            # lookup (`source` was assigned but never used).
                            source = res['source']
                            st.write("**Sources**:")
                            for url in source:
                                st.markdown(f"- [{url}]({url})")

                            st.write(f"[Google Scholar Search](https://scholar.google.com/scholar?q={quote(res['title'])})")
        
            except Exception as e:
                logger.error(f"Search error: {str(e)}")
                st.error("Search operation failed")

    # System status sidebar
    with st.sidebar:
        st.subheader("πŸ“Š System Health")

        col1, col2 = st.columns(2)
        with col1:
            st.metric("Documents", f"{search_system.metadata_mgr.total_docs:,}")

        with col2:
            vectors = search_system.total_vectors
            st.metric("Vectors", f"{vectors:,}")

        # Diagnostics section
        if st.checkbox("Show advanced diagnostics"):
            add_diagnostics_ui(search_system)

        # System monitoring
        st.subheader("βš™οΈ Monitoring")
        with st.expander("Performance"):
            mem = psutil.virtual_memory()
            st.write(f"Memory: {mem.percent}% used")
            st.write(f"CPU Cores: {psutil.cpu_count()}")
            st.write(f"Active threads: {threading.active_count()}")

        # Clears the cached search system and reruns the script,
        # forcing a fresh initialization.
        if st.button("πŸ”„ Refresh System"):
            st.cache_resource.clear()
            st.rerun()


    # Footer
    st.markdown(""" 
        <footer style="text-align: center; padding: 1rem; margin-top: 2rem; border-top: 1px solid #ddd;">
        <small>&copy; 2025 Joel Adesanya's MSc Project.</small>
        </footer>
        """, unsafe_allow_html=True)
    
# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main()