naman1102 commited on
Commit
c641c76
Β·
1 Parent(s): c9a26fa
Files changed (4) hide show
  1. old_app2.py +0 -1253
  2. repo_explorer_old.py +0 -200
  3. test.py +0 -23
  4. test_vectorization.py +0 -135
old_app2.py DELETED
@@ -1,1253 +0,0 @@
1
- import gradio as gr
2
- import regex as re
3
- import csv
4
- import pandas as pd
5
- from typing import List, Dict, Tuple, Any
6
- import logging
7
- import os
8
- import time
9
-
10
- # Import core logic from other modules, as in app_old.py
11
- from analyzer import (
12
- combine_repo_files_for_llm,
13
- parse_llm_json_response,
14
- analyze_combined_file,
15
- handle_load_repository
16
- )
17
- from hf_utils import download_filtered_space_files, search_top_spaces
18
- from chatbot_page import chat_with_user, extract_keywords_from_conversation
19
- from repo_explorer import create_repo_explorer_tab, setup_repo_explorer_events
20
-
21
- # --- Configuration ---
22
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
- logger = logging.getLogger(__name__)
24
-
25
- CSV_FILE = "repo_ids.csv"
26
- CHATBOT_SYSTEM_PROMPT = (
27
- "You are a helpful assistant whose ONLY job is to gather information about the user's ideal repository requirements. "
28
- "DO NOT suggest any specific repositories or give repository recommendations. "
29
- "Your role is to ask clarifying questions to understand exactly what the user is looking for. "
30
- "Ask about their use case, preferred programming language, specific features needed, project type, etc. "
31
- "When you feel you have gathered enough detailed information about their requirements, "
32
- "tell the user: 'I think I have enough information about your requirements. Please click the Extract Keywords button to search for repositories.' "
33
- "Focus on understanding their needs, not providing solutions."
34
- )
35
- CHATBOT_INITIAL_MESSAGE = "Hello! I'm here to help you define your ideal Hugging Face repository requirements. I won't suggest specific repos - my job is to understand exactly what you're looking for. Tell me about your project: What type of application are you building? What's your use case?"
36
-
37
- # --- Helper Functions (Logic) ---
38
-
39
- def get_top_relevant_repos(df: pd.DataFrame, user_requirements: str, top_n: int = 3) -> pd.DataFrame:
40
- """
41
- Uses LLM to select the top N most relevant repositories based on user requirements and analysis data.
42
- """
43
- try:
44
- if df.empty:
45
- return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
46
-
47
- # Filter out rows with no analysis data
48
- analyzed_df = df.copy()
49
- analyzed_df = analyzed_df[
50
- (analyzed_df['strength'].str.strip() != '') |
51
- (analyzed_df['weaknesses'].str.strip() != '') |
52
- (analyzed_df['speciality'].str.strip() != '') |
53
- (analyzed_df['relevance rating'].str.strip() != '')
54
- ]
55
-
56
- if analyzed_df.empty:
57
- logger.warning("No analyzed repositories found for LLM selection")
58
- return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
59
-
60
- # Create a prompt for the LLM
61
- csv_data = ""
62
- for idx, row in analyzed_df.iterrows():
63
- csv_data += f"Repository: {row['repo id']}\n"
64
- csv_data += f"Strengths: {row['strength']}\n"
65
- csv_data += f"Weaknesses: {row['weaknesses']}\n"
66
- csv_data += f"Speciality: {row['speciality']}\n"
67
- csv_data += f"Relevance: {row['relevance rating']}\n\n"
68
-
69
- user_context = user_requirements if user_requirements.strip() else "General repository recommendation"
70
-
71
- prompt = f"""Based on the user's requirements and the analysis of repositories below, select the top {top_n} most relevant repositories.
72
-
73
- User Requirements:
74
- {user_context}
75
-
76
- Repository Analysis Data:
77
- {csv_data}
78
-
79
- Please analyze all repositories and select the {top_n} most relevant ones based on:
80
- 1. How well they match the user's specific requirements
81
- 2. Their strengths and capabilities
82
- 3. Their relevance rating
83
- 4. Their speciality alignment with user needs
84
-
85
- Return ONLY a JSON list of the repository IDs in order of relevance (most relevant first). Example format:
86
- ["repo1", "repo2", "repo3"]
87
-
88
- Selected repositories:"""
89
-
90
- try:
91
- from openai import OpenAI
92
- client = OpenAI(api_key=os.getenv("modal_api"))
93
- client.base_url = os.getenv("base_url")
94
-
95
- response = client.chat.completions.create(
96
- model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
97
- messages=[
98
- {"role": "system", "content": "You are an expert at analyzing and ranking repositories based on user requirements. Always return valid JSON."},
99
- {"role": "user", "content": prompt}
100
- ],
101
- max_tokens=200,
102
- temperature=0.3
103
- )
104
-
105
- llm_response = response.choices[0].message.content.strip()
106
- logger.info(f"LLM response for top repos: {llm_response}")
107
-
108
- # Extract JSON from response
109
- import json
110
- import re
111
-
112
- # Try to find JSON array in the response
113
- json_match = re.search(r'\[.*\]', llm_response)
114
- if json_match:
115
- selected_repos = json.loads(json_match.group())
116
- logger.info(f"LLM selected repositories: {selected_repos}")
117
-
118
- # Filter dataframe to only include selected repositories in order
119
- top_repos_list = []
120
- for repo_id in selected_repos[:top_n]:
121
- matching_rows = analyzed_df[analyzed_df['repo id'] == repo_id]
122
- if not matching_rows.empty:
123
- top_repos_list.append(matching_rows.iloc[0])
124
-
125
- if top_repos_list:
126
- top_repos = pd.DataFrame(top_repos_list)
127
- logger.info(f"Successfully selected {len(top_repos)} repositories using LLM")
128
- return top_repos
129
-
130
- # Fallback: if LLM response parsing fails, use first N analyzed repos
131
- logger.warning("Failed to parse LLM response, using fallback selection")
132
- return analyzed_df.head(top_n)
133
-
134
- except Exception as llm_error:
135
- logger.error(f"LLM selection failed: {llm_error}")
136
- # Fallback: return first N repositories with analysis data
137
- return analyzed_df.head(top_n)
138
-
139
- except Exception as e:
140
- logger.error(f"Error in LLM-based repo selection: {e}")
141
- return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
142
-
143
- def write_repos_to_csv(repo_ids: List[str]) -> None:
144
- """Writes a list of repo IDs to the CSV file, overwriting the previous content."""
145
- try:
146
- with open(CSV_FILE, mode="w", newline='', encoding="utf-8") as csvfile:
147
- writer = csv.writer(csvfile)
148
- writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
149
- for repo_id in repo_ids:
150
- writer.writerow([repo_id, "", "", "", ""])
151
- logger.info(f"Wrote {len(repo_ids)} repo IDs to {CSV_FILE}")
152
- except Exception as e:
153
- logger.error(f"Error writing to CSV: {e}")
154
-
155
- def format_text_for_dataframe(text: str, max_length: int = 200) -> str:
156
- """Format text for better display in dataframe by truncating and cleaning."""
157
- if not text or pd.isna(text):
158
- return ""
159
-
160
- # Clean the text
161
- text = str(text).strip()
162
-
163
- # Remove excessive whitespace and newlines
164
- text = re.sub(r'\s+', ' ', text)
165
-
166
- # Truncate if too long
167
- if len(text) > max_length:
168
- text = text[:max_length-3] + "..."
169
-
170
- return text
171
-
172
- def read_csv_to_dataframe() -> pd.DataFrame:
173
- """Reads the CSV file into a pandas DataFrame with full text preserved."""
174
- try:
175
- df = pd.read_csv(CSV_FILE, dtype=str).fillna('')
176
-
177
- # Keep the full text intact - don't truncate here
178
- # The truncation will be handled in the UI display layer
179
-
180
- return df
181
- except FileNotFoundError:
182
- return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
183
- except Exception as e:
184
- logger.error(f"Error reading CSV: {e}")
185
- return pd.DataFrame()
186
-
187
- def format_dataframe_for_display(df: pd.DataFrame) -> pd.DataFrame:
188
- """Returns dataframe with full text (no truncation) for display."""
189
- if df.empty:
190
- return df
191
-
192
- # Return the dataframe as-is without any text truncation
193
- # This will show the full text content in the CSV display
194
- return df.copy()
195
-
196
- def analyze_and_update_single_repo(repo_id: str, user_requirements: str = "") -> Tuple[str, str, pd.DataFrame]:
197
- """
198
- Downloads, analyzes a single repo, updates the CSV, and returns results.
199
- Now includes user requirements for better relevance rating.
200
- This function combines the logic of downloading, analyzing, and updating the CSV for one repo.
201
- """
202
- try:
203
- logger.info(f"Starting analysis for repo: {repo_id}")
204
- download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=['.py', '.md', '.txt'])
205
- txt_path = combine_repo_files_for_llm()
206
-
207
- with open(txt_path, "r", encoding="utf-8") as f:
208
- combined_content = f.read()
209
-
210
- llm_output = analyze_combined_file(txt_path, user_requirements)
211
-
212
- last_start = llm_output.rfind('{')
213
- last_end = llm_output.rfind('}')
214
- final_json_str = llm_output[last_start:last_end+1] if last_start != -1 and last_end != -1 else "{}"
215
-
216
- llm_json = parse_llm_json_response(final_json_str)
217
-
218
- summary = ""
219
- if isinstance(llm_json, dict) and "error" not in llm_json:
220
- strengths = llm_json.get("strength", "N/A")
221
- weaknesses = llm_json.get("weaknesses", "N/A")
222
- relevance = llm_json.get("relevance rating", "N/A")
223
- summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}\n\nRelevance: {relevance}"
224
- else:
225
- summary = f"JSON extraction: FAILED\nRaw response might not be valid JSON."
226
-
227
- # Update CSV
228
- df = read_csv_to_dataframe()
229
- repo_found_in_df = False
230
- for idx, row in df.iterrows():
231
- if row["repo id"] == repo_id:
232
- if isinstance(llm_json, dict):
233
- df.at[idx, "strength"] = llm_json.get("strength", "")
234
- df.at[idx, "weaknesses"] = llm_json.get("weaknesses", "")
235
- df.at[idx, "speciality"] = llm_json.get("speciality", "")
236
- df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
237
- repo_found_in_df = True
238
- break
239
-
240
- if not repo_found_in_df:
241
- logger.warning(f"Repo ID {repo_id} not found in CSV for updating.")
242
-
243
- # Write CSV with better error handling and flushing
244
- try:
245
- df.to_csv(CSV_FILE, index=False)
246
- # Force file system flush
247
- os.sync() if hasattr(os, 'sync') else None
248
- logger.info(f"Successfully updated CSV for {repo_id}")
249
- except Exception as csv_error:
250
- logger.error(f"Failed to write CSV for {repo_id}: {csv_error}")
251
- # Try once more with a small delay
252
- time.sleep(0.2)
253
- try:
254
- df.to_csv(CSV_FILE, index=False)
255
- logger.info(f"Successfully updated CSV for {repo_id} on retry")
256
- except Exception as retry_error:
257
- logger.error(f"Failed to write CSV for {repo_id} on retry: {retry_error}")
258
-
259
- logger.info(f"Successfully analyzed and updated CSV for {repo_id}")
260
- return combined_content, summary, df
261
-
262
- except Exception as e:
263
- logger.error(f"An error occurred during analysis of {repo_id}: {e}")
264
- error_summary = f"Error analyzing repo: {e}"
265
- return "", error_summary, format_dataframe_for_display(read_csv_to_dataframe())
266
-
267
- # --- NEW: Helper for Chat History Conversion ---
268
- def convert_messages_to_tuples(history: List[Dict[str, str]]) -> List[Tuple[str, str]]:
269
- """
270
- Converts Gradio's 'messages' format to the old 'tuple' format for compatibility.
271
- This robust version correctly handles histories that start with an assistant message.
272
- """
273
- tuple_history = []
274
- # Iterate through the history to find user messages
275
- for i, msg in enumerate(history):
276
- if msg['role'] == 'user':
277
- # Once a user message is found, check if the next message is from the assistant
278
- if i + 1 < len(history) and history[i+1]['role'] == 'assistant':
279
- user_content = msg['content']
280
- assistant_content = history[i+1]['content']
281
- tuple_history.append((user_content, assistant_content))
282
- return tuple_history
283
-
284
- # --- Gradio UI ---
285
-
286
- def create_ui() -> gr.Blocks:
287
- """Creates and configures the entire Gradio interface."""
288
-
289
- css = """
290
- /* Modern sleek design */
291
- .gradio-container {
292
- font-family: 'Inter', 'system-ui', sans-serif;
293
- background: linear-gradient(135deg, #0a0a0a 0%, #1a1a1a 100%);
294
- min-height: 100vh;
295
- }
296
-
297
- .gr-form {
298
- background: rgba(255, 255, 255, 0.95);
299
- backdrop-filter: blur(10px);
300
- border-radius: 16px;
301
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
302
- padding: 24px;
303
- margin: 16px;
304
- border: 1px solid rgba(255, 255, 255, 0.2);
305
- }
306
-
307
- .gr-button {
308
- background: linear-gradient(45deg, #667eea, #764ba2);
309
- border: none;
310
- border-radius: 12px;
311
- color: white;
312
- font-weight: 600;
313
- padding: 12px 24px;
314
- transition: all 0.3s ease;
315
- box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4);
316
- }
317
-
318
- .gr-button:hover {
319
- transform: translateY(-2px);
320
- box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6);
321
- }
322
-
323
- .gr-textbox {
324
- border: 2px solid rgba(102, 126, 234, 0.2);
325
- border-radius: 12px;
326
- background: rgba(255, 255, 255, 0.9);
327
- transition: all 0.3s ease;
328
- }
329
-
330
- .gr-textbox:focus {
331
- border-color: #667eea;
332
- box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
333
- }
334
-
335
- .gr-panel {
336
- background: rgba(255, 255, 255, 0.95);
337
- border-radius: 16px;
338
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
339
- border: 1px solid rgba(255, 255, 255, 0.2);
340
- }
341
-
342
- .gr-tab-nav {
343
- background: rgba(255, 255, 255, 0.95);
344
- border-radius: 12px 12px 0 0;
345
- backdrop-filter: blur(10px);
346
- }
347
-
348
- .gr-tab-nav button {
349
- background: transparent;
350
- border: none;
351
- padding: 16px 24px;
352
- font-weight: 600;
353
- color: #666;
354
- transition: all 0.3s ease;
355
- }
356
-
357
- .gr-tab-nav button.selected {
358
- background: linear-gradient(45deg, #667eea, #764ba2);
359
- color: white;
360
- border-radius: 8px;
361
- }
362
-
363
- .chatbot {
364
- border-radius: 16px;
365
- box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1);
366
- }
367
-
368
- /* Hide Gradio footer */
369
- footer {
370
- display: none !important;
371
- }
372
-
373
- /* Custom scrollbar */
374
- ::-webkit-scrollbar {
375
- width: 8px;
376
- }
377
-
378
- ::-webkit-scrollbar-track {
379
- background: rgba(255, 255, 255, 0.1);
380
- border-radius: 4px;
381
- }
382
-
383
- ::-webkit-scrollbar-thumb {
384
- background: linear-gradient(45deg, #667eea, #764ba2);
385
- border-radius: 4px;
386
- }
387
-
388
- /* Improved dataframe styling for full text display */
389
- .gr-dataframe {
390
- border-radius: 12px;
391
- overflow: hidden;
392
- box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1);
393
- background: rgba(255, 255, 255, 0.98);
394
- }
395
-
396
- .gr-dataframe table {
397
- width: 100%;
398
- table-layout: fixed;
399
- border-collapse: collapse;
400
- }
401
-
402
- /* Column width specifications for both dataframes */
403
- .gr-dataframe th,
404
- .gr-dataframe td {
405
- padding: 12px 16px;
406
- text-align: left;
407
- border-bottom: 1px solid rgba(0, 0, 0, 0.1);
408
- font-size: 0.95rem;
409
- line-height: 1.4;
410
- }
411
-
412
- /* Specific column widths - applying to both dataframes */
413
- .gr-dataframe th:nth-child(1),
414
- .gr-dataframe td:nth-child(1) { width: 16.67% !important; min-width: 16.67% !important; max-width: 16.67% !important; }
415
- .gr-dataframe th:nth-child(2),
416
- .gr-dataframe td:nth-child(2) { width: 25% !important; min-width: 25% !important; max-width: 25% !important; }
417
- .gr-dataframe th:nth-child(3),
418
- .gr-dataframe td:nth-child(3) { width: 25% !important; min-width: 25% !important; max-width: 25% !important; }
419
- .gr-dataframe th:nth-child(4),
420
- .gr-dataframe td:nth-child(4) { width: 20.83% !important; min-width: 20.83% !important; max-width: 20.83% !important; }
421
- .gr-dataframe th:nth-child(5),
422
- .gr-dataframe td:nth-child(5) { width: 12.5% !important; min-width: 12.5% !important; max-width: 12.5% !important; }
423
-
424
- /* Additional specific targeting for both dataframes */
425
- div[data-testid="dataframe"] table th:nth-child(1),
426
- div[data-testid="dataframe"] table td:nth-child(1) { width: 16.67% !important; }
427
- div[data-testid="dataframe"] table th:nth-child(2),
428
- div[data-testid="dataframe"] table td:nth-child(2) { width: 25% !important; }
429
- div[data-testid="dataframe"] table th:nth-child(3),
430
- div[data-testid="dataframe"] table td:nth-child(3) { width: 25% !important; }
431
- div[data-testid="dataframe"] table th:nth-child(4),
432
- div[data-testid="dataframe"] table td:nth-child(4) { width: 20.83% !important; }
433
- div[data-testid="dataframe"] table th:nth-child(5),
434
- div[data-testid="dataframe"] table td:nth-child(5) { width: 12.5% !important; }
435
-
436
- /* Make repository names clickable */
437
- .gr-dataframe td:nth-child(1) {
438
- cursor: pointer;
439
- color: #667eea;
440
- font-weight: 600;
441
- transition: all 0.3s ease;
442
- }
443
-
444
- .gr-dataframe td:nth-child(1):hover {
445
- background-color: rgba(102, 126, 234, 0.1);
446
- color: #764ba2;
447
- transform: scale(1.02);
448
- }
449
-
450
- /* Content columns - readable styling with scroll for long text */
451
- .gr-dataframe td:nth-child(2),
452
- .gr-dataframe td:nth-child(3),
453
- .gr-dataframe td:nth-child(4),
454
- .gr-dataframe td:nth-child(5) {
455
- cursor: default;
456
- font-size: 0.9rem;
457
- }
458
-
459
- .gr-dataframe tbody tr:hover {
460
- background-color: rgba(102, 126, 234, 0.05);
461
- }
462
-
463
- /* JavaScript for auto-scroll to top on tab change */
464
- <script>
465
- document.addEventListener('DOMContentLoaded', function() {
466
- // Function to scroll to top
467
- function scrollToTop() {
468
- window.scrollTo({
469
- top: 0,
470
- behavior: 'smooth'
471
- });
472
- }
473
-
474
- // Observer for tab changes
475
- const observer = new MutationObserver(function(mutations) {
476
- mutations.forEach(function(mutation) {
477
- if (mutation.type === 'attributes' && mutation.attributeName === 'class') {
478
- const target = mutation.target;
479
- if (target.classList && target.classList.contains('selected')) {
480
- // Tab was selected, scroll to top
481
- setTimeout(scrollToTop, 100);
482
- }
483
- }
484
- });
485
- });
486
-
487
- // Observe tab navigation buttons
488
- const tabButtons = document.querySelectorAll('.gr-tab-nav button');
489
- tabButtons.forEach(button => {
490
- observer.observe(button, { attributes: true });
491
-
492
- // Also add click listener for immediate scroll
493
- button.addEventListener('click', function() {
494
- setTimeout(scrollToTop, 150);
495
- });
496
- });
497
-
498
- // Enhanced listener for programmatic tab changes (button-triggered navigation)
499
- let lastSelectedTab = null;
500
- const checkInterval = setInterval(function() {
501
- const currentSelectedTab = document.querySelector('.gr-tab-nav button.selected');
502
- if (currentSelectedTab && currentSelectedTab !== lastSelectedTab) {
503
- lastSelectedTab = currentSelectedTab;
504
- setTimeout(scrollToTop, 100);
505
- }
506
- }, 100);
507
-
508
- // Additional scroll trigger for repo explorer navigation
509
- window.addEventListener('repoExplorerNavigation', function() {
510
- setTimeout(scrollToTop, 200);
511
- });
512
-
513
- // Watch for specific tab transitions to repo explorer
514
- const repoExplorerObserver = new MutationObserver(function(mutations) {
515
- mutations.forEach(function(mutation) {
516
- if (mutation.type === 'attributes' && mutation.attributeName === 'class') {
517
- const target = mutation.target;
518
- if (target.textContent && target.textContent.includes('πŸ” Repo Explorer') && target.classList.contains('selected')) {
519
- setTimeout(scrollToTop, 150);
520
- }
521
- }
522
- });
523
- });
524
-
525
- // Start observing for repo explorer specific changes
526
- setTimeout(function() {
527
- const repoExplorerTab = Array.from(document.querySelectorAll('.gr-tab-nav button')).find(btn =>
528
- btn.textContent && btn.textContent.includes('πŸ” Repo Explorer')
529
- );
530
- if (repoExplorerTab) {
531
- repoExplorerObserver.observe(repoExplorerTab, { attributes: true });
532
- }
533
- }, 1000);
534
- });
535
- </script>
536
- """
537
-
538
- with gr.Blocks(
539
- theme=gr.themes.Soft(
540
- primary_hue="blue",
541
- secondary_hue="purple",
542
- neutral_hue="gray",
543
- font=["Inter", "system-ui", "sans-serif"]
544
- ),
545
- css=css,
546
- title="πŸš€ HF Repo Analyzer"
547
- ) as app:
548
-
549
- # --- State Management ---
550
- # Using simple, separate state objects for robustness.
551
- repo_ids_state = gr.State([])
552
- current_repo_idx_state = gr.State(0)
553
- user_requirements_state = gr.State("") # Store user requirements from chatbot
554
- loaded_repo_content_state = gr.State("") # Store loaded repository content
555
- current_repo_id_state = gr.State("") # Store current repository ID
556
- selected_repo_id_state = gr.State("") # Store selected repository ID for modal actions
557
-
558
- gr.Markdown(
559
- """
560
- <div style="text-align: center; padding: 40px 20px; background: rgba(255, 255, 255, 0.1); border-radius: 20px; margin: 20px auto; max-width: 900px; backdrop-filter: blur(10px);">
561
- <h1 style="font-size: 3.5rem; font-weight: 800; margin: 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;">
562
- πŸš€ HF Repo Analyzer
563
- </h1>
564
- <p style="font-size: 1.3rem; color: rgba(255, 255, 255, 0.9); margin: 16px 0 0 0; font-weight: 400; line-height: 1.6;">
565
- Discover, analyze, and evaluate Hugging Face repositories with AI-powered insights
566
- </p>
567
- <div style="height: 4px; width: 80px; background: linear-gradient(45deg, #667eea, #764ba2); margin: 24px auto; border-radius: 2px;"></div>
568
- </div>
569
- """
570
- )
571
-
572
- # Global Reset Button - visible on all tabs
573
- with gr.Row():
574
- with gr.Column(scale=4):
575
- pass
576
- with gr.Column(scale=1):
577
- reset_all_btn = gr.Button("πŸ”„ Reset Everything", variant="stop", size="lg")
578
- with gr.Column(scale=1):
579
- pass
580
-
581
- with gr.Tabs() as tabs:
582
- # --- Input Tab ---
583
- with gr.TabItem("πŸ“ Input & Search", id="input_tab"):
584
- with gr.Row(equal_height=True):
585
- with gr.Column(scale=1):
586
- gr.Markdown("### πŸ“ Repository IDs")
587
- repo_id_input = gr.Textbox(
588
- label="Repository IDs",
589
- lines=8,
590
- placeholder="microsoft/DialoGPT-medium\nopenai/whisper\nhuggingface/transformers",
591
- info="Enter repo IDs separated by commas or new lines"
592
- )
593
- submit_repo_btn = gr.Button("πŸš€ Submit Repositories", variant="primary", size="lg")
594
-
595
- with gr.Column(scale=1):
596
- gr.Markdown("### πŸ” Keyword Search")
597
- keyword_input = gr.Textbox(
598
- label="Search Keywords",
599
- lines=8,
600
- placeholder="text generation\nimage classification\nsentiment analysis",
601
- info="Enter keywords to find relevant repositories"
602
- )
603
- search_btn = gr.Button("πŸ”Ž Search Repositories", variant="primary", size="lg")
604
-
605
- status_box_input = gr.Textbox(label="πŸ“Š Status", interactive=False, lines=2)
606
-
607
- # --- Analysis Tab ---
608
- with gr.TabItem("πŸ”¬ Analysis", id="analysis_tab"):
609
- gr.Markdown("### πŸ§ͺ Repository Analysis Engine")
610
-
611
- # Display current user requirements
612
- with gr.Row():
613
- current_requirements_display = gr.Textbox(
614
- label="πŸ“‹ Current User Requirements",
615
- interactive=False,
616
- lines=3,
617
- info="Requirements extracted from AI chat conversation for relevance rating"
618
- )
619
-
620
- with gr.Row():
621
- analyze_all_btn = gr.Button("πŸš€ Analyze All Repositories", variant="primary", size="lg", scale=1)
622
- with gr.Column(scale=2):
623
- status_box_analysis = gr.Textbox(label="πŸ“ˆ Analysis Status", interactive=False, lines=2)
624
-
625
- # Progress bar for batch analysis
626
- with gr.Row():
627
- analysis_progress = gr.Progress()
628
- # progress_display = gr.Textbox(
629
- # label="πŸ“Š Batch Analysis Progress",
630
- # interactive=False,
631
- # lines=2,
632
- # visible=False,
633
- # info="Shows progress when analyzing all repositories"
634
- # )
635
-
636
- with gr.Row(equal_height=True):
637
- # with gr.Column():
638
- # content_output = gr.Textbox(
639
- # label="πŸ“„ Repository Content",
640
- # lines=20,
641
- # show_copy_button=True,
642
- # info="Raw content extracted from the repository"
643
- # )
644
- # with gr.Column():
645
- # summary_output = gr.Textbox(
646
- # label="🎯 AI Analysis Summary",
647
- # lines=20,
648
- # show_copy_button=True,
649
- # info="Detailed analysis and insights from AI"
650
- # )
651
- pass
652
-
653
- gr.Markdown("### πŸ“Š Results Dashboard")
654
-
655
- # Top 3 Most Relevant Repositories (initially hidden)
656
- with gr.Column(visible=False) as top_repos_section:
657
- gr.Markdown("### πŸ† Top 3 Most Relevant Repositories")
658
- gr.Markdown("🎯 **These are the highest-rated repositories based on your requirements:**")
659
- top_repos_df = gr.Dataframe(
660
- headers=["Repository", "Strengths", "Weaknesses", "Speciality", "Relevance"],
661
- column_widths=["16.67%", "25%", "25%", "20.83%", "12.5%"],
662
- wrap=True,
663
- interactive=False
664
- )
665
-
666
- gr.Markdown("πŸ’‘ **Tip:** Full text is displayed directly in the table. Click on repository names to explore or visit them!")
667
-
668
- # Text expansion modal for showing full content (kept for backwards compatibility)
669
- with gr.Row():
670
- with gr.Column():
671
- text_expansion_modal = gr.Column(visible=False)
672
- with text_expansion_modal:
673
- gr.Markdown("### πŸ“„ Full Content View")
674
- expanded_content_title = gr.Textbox(
675
- label="Content Type",
676
- interactive=False,
677
- info="Full text content for the selected field"
678
- )
679
- expanded_content_text = gr.Textbox(
680
- label="Full Text",
681
- lines=10,
682
- interactive=False,
683
- show_copy_button=True,
684
- info="Complete untruncated content"
685
- )
686
- close_text_modal_btn = gr.Button("❌ Close", size="lg")
687
-
688
- # Modal popup for repository action selection
689
- with gr.Row():
690
- with gr.Column():
691
- repo_action_modal = gr.Column(visible=False)
692
- with repo_action_modal:
693
- gr.Markdown("### πŸ”— Repository Actions")
694
- selected_repo_display = gr.Textbox(
695
- label="Selected Repository",
696
- interactive=False,
697
- info="Choose what you'd like to do with this repository"
698
- )
699
- with gr.Row():
700
- visit_repo_btn = gr.Button("🌐 Visit Hugging Face Space", variant="primary", size="lg")
701
- explore_repo_btn = gr.Button("πŸ” Open in Repo Explorer", variant="secondary", size="lg")
702
- cancel_modal_btn = gr.Button("❌ Cancel", size="lg")
703
-
704
- gr.Markdown("### πŸ“‹ All Analysis Results")
705
- df_output = gr.Dataframe(
706
- headers=["Repository", "Strengths", "Weaknesses", "Speciality", "Relevance"],
707
- column_widths=["16.67%", "25%", "25%", "20.83%", "12.5%"],
708
- wrap=True,
709
- interactive=False
710
- )
711
-
712
- # --- Chatbot Tab ---
713
- with gr.TabItem("πŸ€– AI Assistant", id="chatbot_tab"):
714
- gr.Markdown("### πŸ’¬ Intelligent Repository Discovery")
715
-
716
- chatbot = gr.Chatbot(
717
- label="πŸ€– AI Assistant",
718
- height=450,
719
- type="messages",
720
- avatar_images=(
721
- "https://cdn-icons-png.flaticon.com/512/149/149071.png",
722
- "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"
723
- ),
724
- show_copy_button=True
725
- )
726
-
727
- with gr.Row():
728
- msg_input = gr.Textbox(
729
- label="πŸ’­ Your Message",
730
- placeholder="Tell me about your ideal repository...",
731
- lines=1,
732
- scale=4,
733
- info="Describe what you're looking for"
734
- )
735
- send_btn = gr.Button("πŸ“€ Send", variant="primary", scale=1)
736
- end_chat_btn = gr.Button("🎯 Extract Keywords", scale=1)
737
- use_keywords_btn = gr.Button("πŸ”Ž Search Now", variant="primary", scale=1)
738
-
739
- with gr.Row():
740
- with gr.Column():
741
- extracted_keywords_output = gr.Textbox(
742
- label="🏷️ Extracted Keywords",
743
- interactive=False,
744
- show_copy_button=True,
745
- info="AI-generated search terms from our conversation"
746
- )
747
- with gr.Column():
748
- status_box_chatbot = gr.Textbox(
749
- label="πŸ“Š Chat Status",
750
- interactive=False,
751
- info="Current conversation status"
752
- )
753
-
754
- # --- Repo Explorer Tab ---
755
- with gr.TabItem("πŸ” Repo Explorer", id="repo_explorer_tab"):
756
- repo_components, repo_states = create_repo_explorer_tab()
757
-
758
- # --- Footer ---
759
- gr.Markdown(
760
- """
761
- <div style="text-align: center; padding: 30px 20px; margin-top: 40px; background: rgba(255, 255, 255, 0.1); border-radius: 16px; backdrop-filter: blur(10px);">
762
- <p style="margin: 0; color: rgba(255, 255, 255, 0.8); font-size: 0.95rem; font-weight: 500;">
763
- πŸš€ Powered by <span style="background: linear-gradient(45deg, #667eea, #764ba2); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 700;">Gradio</span>
764
- & <span style="background: linear-gradient(45deg, #667eea, #764ba2); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 700;">Hugging Face</span>
765
- </p>
766
- <div style="height: 2px; width: 60px; background: linear-gradient(45deg, #667eea, #764ba2); margin: 16px auto; border-radius: 1px;"></div>
767
- </div>
768
- """
769
- )
770
-
771
- # --- Event Handler Functions ---
772
-
773
- def handle_repo_id_submission(text: str) -> Tuple[List[str], int, pd.DataFrame, str, Any]:
774
- """Processes submitted repo IDs, updates state, and prepares for analysis."""
775
- if not text:
776
- return [], 0, pd.DataFrame(), "Status: Please enter repository IDs.", gr.update(selected="input_tab")
777
-
778
- repo_ids = list(dict.fromkeys([repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]))
779
- write_repos_to_csv(repo_ids)
780
- df = format_dataframe_for_display(read_csv_to_dataframe())
781
- status = f"Status: {len(repo_ids)} repositories submitted. Ready for analysis."
782
- return repo_ids, 0, df, status, gr.update(selected="analysis_tab")
783
-
784
- def handle_keyword_search(keywords: str) -> Tuple[List[str], int, pd.DataFrame, str, Any]:
785
- """Processes submitted keywords, finds repos, updates state, and prepares for analysis."""
786
- if not keywords:
787
- return [], 0, pd.DataFrame(), "Status: Please enter keywords.", gr.update(selected="input_tab")
788
-
789
- keyword_list = [k.strip() for k in re.split(r'[\n,]+', keywords) if k.strip()]
790
- repo_ids = []
791
- for kw in keyword_list:
792
- repo_ids.extend(search_top_spaces(kw, limit=5))
793
-
794
- unique_repo_ids = list(dict.fromkeys(repo_ids))
795
- write_repos_to_csv(unique_repo_ids)
796
- df = format_dataframe_for_display(read_csv_to_dataframe())
797
- status = f"Status: Found {len(unique_repo_ids)} repositories. Ready for analysis."
798
- return unique_repo_ids, 0, df, status, gr.update(selected="analysis_tab")
799
-
800
- def extract_user_requirements_from_chat(history: List[Dict[str, str]]) -> str:
801
- """Extract user requirements from chatbot conversation."""
802
- if not history:
803
- return ""
804
-
805
- user_messages = []
806
- for msg in history:
807
- if msg.get('role') == 'user':
808
- user_messages.append(msg.get('content', ''))
809
-
810
- if not user_messages:
811
- return ""
812
-
813
- # Combine all user messages as requirements
814
- requirements = "\n".join([f"- {msg}" for msg in user_messages if msg.strip()])
815
- return requirements
816
-
817
- def handle_user_message(user_message: str, history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str]:
818
- """Appends the user's message to the history, preparing for the bot's response."""
819
- # Initialize chatbot with welcome message if empty
820
- if not history:
821
- history = [{"role": "assistant", "content": CHATBOT_INITIAL_MESSAGE}]
822
-
823
- if user_message:
824
- history.append({"role": "user", "content": user_message})
825
- return history, ""
826
-
827
- def handle_bot_response(history: List[Dict[str, str]]) -> List[Dict[str, str]]:
828
- """Generates and appends the bot's response using the compatible history format."""
829
- if not history or history[-1]["role"] != "user":
830
- return history
831
-
832
- user_message = history[-1]["content"]
833
- # Convert all messages *before* the last user message into tuples for the API
834
- tuple_history_for_api = convert_messages_to_tuples(history[:-1])
835
-
836
- response = chat_with_user(user_message, tuple_history_for_api)
837
- history.append({"role": "assistant", "content": response})
838
- return history
839
-
840
- def handle_end_chat(history: List[Dict[str, str]]) -> Tuple[str, str, str]:
841
- """Ends the chat, extracts and sanitizes keywords from the conversation, and extracts user requirements."""
842
- if not history:
843
- return "", "Status: Chat is empty, nothing to analyze.", ""
844
-
845
- # Convert the full, valid history for the extraction logic
846
- tuple_history = convert_messages_to_tuples(history)
847
- if not tuple_history:
848
- return "", "Status: No completed conversations to analyze.", ""
849
-
850
- # Get raw keywords string from the LLM
851
- raw_keywords_str = extract_keywords_from_conversation(tuple_history)
852
-
853
- # Sanitize the LLM output to extract only keyword-like parts.
854
- # A keyword can contain letters, numbers, underscores, spaces, and hyphens.
855
- cleaned_keywords = re.findall(r'[\w\s-]+', raw_keywords_str)
856
-
857
- # Trim whitespace from each found keyword and filter out any empty strings
858
- cleaned_keywords = [kw.strip() for kw in cleaned_keywords if kw.strip()]
859
-
860
- if not cleaned_keywords:
861
- return "", f"Status: Could not extract valid keywords. Raw LLM output: '{raw_keywords_str}'", ""
862
-
863
- # Join them into a clean, comma-separated string for the search tool
864
- final_keywords_str = ", ".join(cleaned_keywords)
865
-
866
- # Extract user requirements for analysis
867
- user_requirements = extract_user_requirements_from_chat(history)
868
-
869
- status = "Status: Keywords extracted. User requirements saved for analysis."
870
- return final_keywords_str, status, user_requirements
871
-
872
- def handle_dataframe_select(evt: gr.SelectData, df_data) -> Tuple[str, Any, Any, str, str, Any, str]:
873
- """Handle dataframe row selection - only repo ID (column 0) shows modal since full text is now displayed directly."""
874
- print(f"DEBUG: Selection event triggered!")
875
- print(f"DEBUG: evt = {evt}")
876
- print(f"DEBUG: df_data type = {type(df_data)}")
877
-
878
- if evt is None:
879
- return "", gr.update(visible=False), gr.update(), "", "", gr.update(visible=False), ""
880
-
881
- try:
882
- # Get the selected row and column from the event
883
- row_idx = evt.index[0]
884
- col_idx = evt.index[1]
885
- print(f"DEBUG: Selected row {row_idx}, column {col_idx}")
886
-
887
- # Handle pandas DataFrame
888
- if isinstance(df_data, pd.DataFrame) and not df_data.empty and row_idx < len(df_data):
889
-
890
- if col_idx == 0: # Repository name column - show action modal
891
- repo_id = df_data.iloc[row_idx, 0]
892
- print(f"DEBUG: Extracted repo_id = '{repo_id}'")
893
-
894
- if repo_id and str(repo_id).strip() and str(repo_id).strip() != 'nan':
895
- clean_repo_id = str(repo_id).strip()
896
- logger.info(f"Showing modal for repository: {clean_repo_id}")
897
- return clean_repo_id, gr.update(visible=True), gr.update(), "", "", gr.update(visible=False), clean_repo_id
898
-
899
- # For content columns (1,2,3) and relevance (4), do nothing since full text is shown directly
900
- else:
901
- print(f"DEBUG: Clicked on column {col_idx}, full text already shown in table")
902
- return "", gr.update(visible=False), gr.update(), "", "", gr.update(visible=False), ""
903
- else:
904
- print(f"DEBUG: df_data is not a DataFrame or row_idx {row_idx} out of range")
905
-
906
- except Exception as e:
907
- print(f"DEBUG: Exception occurred: {e}")
908
- logger.error(f"Error handling dataframe selection: {e}")
909
-
910
- return "", gr.update(visible=False), gr.update(), "", "", gr.update(visible=False), ""
911
-
912
- def handle_analyze_all_repos(repo_ids: List[str], user_requirements: str, progress=gr.Progress()) -> Tuple[pd.DataFrame, str, pd.DataFrame, Any]:
913
- """Analyzes all repositories in the CSV file with progress tracking."""
914
- if not repo_ids:
915
- return pd.DataFrame(), "Status: No repositories to analyze. Please submit repo IDs first.", pd.DataFrame(), gr.update(visible=False)
916
-
917
- total_repos = len(repo_ids)
918
-
919
- try:
920
- # Start the progress tracking
921
- progress(0, desc="Initializing batch analysis...")
922
-
923
- successful_analyses = 0
924
- failed_analyses = 0
925
- csv_update_failures = 0
926
-
927
- for i, repo_id in enumerate(repo_ids):
928
- # Update progress
929
- progress_percent = (i / total_repos)
930
- progress(progress_percent, desc=f"Analyzing {repo_id} ({i+1}/{total_repos})")
931
-
932
- try:
933
- logger.info(f"Batch analysis: Processing {repo_id} ({i+1}/{total_repos})")
934
-
935
- # Analyze the repository
936
- content, summary, df = analyze_and_update_single_repo(repo_id, user_requirements)
937
-
938
- # Verify the CSV was actually updated by checking if the repo has analysis data
939
- updated_df = read_csv_to_dataframe()
940
- repo_updated = False
941
-
942
- for idx, row in updated_df.iterrows():
943
- if row["repo id"] == repo_id:
944
- # Check if any analysis field is populated
945
- if (row.get("strength", "").strip() or
946
- row.get("weaknesses", "").strip() or
947
- row.get("speciality", "").strip() or
948
- row.get("relevance rating", "").strip()):
949
- repo_updated = True
950
- break
951
-
952
- if repo_updated:
953
- successful_analyses += 1
954
- else:
955
- # CSV update failed - try once more
956
- logger.warning(f"CSV update failed for {repo_id}, attempting retry...")
957
- time.sleep(0.5) # Wait a bit longer
958
-
959
- # Force re-read and re-update
960
- df_retry = read_csv_to_dataframe()
961
- retry_success = False
962
-
963
- # Re-parse the analysis if available
964
- if summary and "JSON extraction: SUCCESS" in summary:
965
- # Extract the analysis from summary - this is a fallback
966
- logger.info(f"Attempting to re-update CSV for {repo_id}")
967
- content_retry, summary_retry, df_retry = analyze_and_update_single_repo(repo_id, user_requirements)
968
-
969
- # Check again
970
- final_df = read_csv_to_dataframe()
971
- for idx, row in final_df.iterrows():
972
- if row["repo id"] == repo_id:
973
- if (row.get("strength", "").strip() or
974
- row.get("weaknesses", "").strip() or
975
- row.get("speciality", "").strip() or
976
- row.get("relevance rating", "").strip()):
977
- retry_success = True
978
- break
979
-
980
- if retry_success:
981
- successful_analyses += 1
982
- else:
983
- csv_update_failures += 1
984
-
985
- # Longer delay to prevent file conflicts
986
- time.sleep(0.3)
987
-
988
- except Exception as e:
989
- logger.error(f"Error analyzing {repo_id}: {e}")
990
- failed_analyses += 1
991
- # Still wait to prevent rapid failures
992
- time.sleep(0.2)
993
-
994
- # Complete the progress
995
- progress(1.0, desc="Batch analysis completed!")
996
-
997
- # Get final updated dataframe
998
- updated_df = read_csv_to_dataframe()
999
-
1000
- # Filter out rows with no analysis data for consistent display with top 3
1001
- analyzed_df = updated_df.copy()
1002
- analyzed_df = analyzed_df[
1003
- (analyzed_df['strength'].str.strip() != '') |
1004
- (analyzed_df['weaknesses'].str.strip() != '') |
1005
- (analyzed_df['speciality'].str.strip() != '') |
1006
- (analyzed_df['relevance rating'].str.strip() != '')
1007
- ]
1008
-
1009
- # Get top 3 most relevant repositories using full data
1010
- top_repos = get_top_relevant_repos(updated_df, user_requirements, top_n=3)
1011
-
1012
- # Final status with detailed breakdown
1013
- final_status = f"πŸŽ‰ Batch Analysis Complete!\nβœ… Successful: {successful_analyses}/{total_repos}\n❌ Failed: {failed_analyses}/{total_repos}"
1014
- if csv_update_failures > 0:
1015
- final_status += f"\n⚠️ CSV Update Issues: {csv_update_failures}/{total_repos}"
1016
-
1017
- # Add top repos info if available
1018
- if not top_repos.empty:
1019
- final_status += f"\n\nπŸ† Top {len(top_repos)} most relevant repositories selected!"
1020
-
1021
- # Show top repos section if we have results
1022
- show_top_section = gr.update(visible=not top_repos.empty)
1023
-
1024
- logger.info(f"Batch analysis completed: {successful_analyses} successful, {failed_analyses} failed, {csv_update_failures} CSV update issues")
1025
- return format_dataframe_for_display(analyzed_df), final_status, format_dataframe_for_display(top_repos), show_top_section
1026
-
1027
- except Exception as e:
1028
- logger.error(f"Error in batch analysis: {e}")
1029
- error_status = f"❌ Batch analysis failed: {e}"
1030
- return format_dataframe_for_display(read_csv_to_dataframe()), error_status, pd.DataFrame(), gr.update(visible=False)
1031
-
1032
- def handle_visit_repo(repo_id: str) -> Tuple[Any, str]:
1033
- """Handle visiting the Hugging Face Space for the repository."""
1034
- if repo_id and repo_id.strip():
1035
- hf_url = f"https://huggingface.co/spaces/{repo_id.strip()}"
1036
- logger.info(f"User chose to visit: {hf_url}")
1037
- return gr.update(visible=False), hf_url
1038
- return gr.update(visible=False), ""
1039
-
1040
- def handle_explore_repo(selected_repo_id: str) -> Tuple[Any, Any, Any]:
1041
- """Handle navigating to the repo explorer and populate the repo ID."""
1042
- logger.info(f"DEBUG: handle_explore_repo called with selected_repo_id: '{selected_repo_id}'")
1043
- logger.info(f"DEBUG: selected_repo_id type: {type(selected_repo_id)}")
1044
- logger.info(f"DEBUG: selected_repo_id length: {len(selected_repo_id) if selected_repo_id else 'None'}")
1045
-
1046
- if selected_repo_id and selected_repo_id.strip() and selected_repo_id.strip() != 'nan':
1047
- clean_repo_id = selected_repo_id.strip()
1048
- return (
1049
- gr.update(visible=False), # close modal
1050
- gr.update(selected="repo_explorer_tab"), # switch tab
1051
- gr.update(value=clean_repo_id) # populate repo explorer input
1052
- )
1053
- else:
1054
- return (
1055
- gr.update(visible=False), # close modal
1056
- gr.update(selected="repo_explorer_tab"), # switch tab
1057
- gr.update() # don't change repo explorer input
1058
- )
1059
-
1060
- def handle_cancel_modal() -> Any:
1061
- """Handle closing the modal."""
1062
- return gr.update(visible=False)
1063
-
1064
- def handle_close_text_modal() -> Any:
1065
- """Handle closing the text expansion modal."""
1066
- return gr.update(visible=False)
1067
-
1068
- def handle_reset_everything() -> Tuple[List[str], int, str, pd.DataFrame, pd.DataFrame, Any, Any, Any, List[Dict[str, str]], str, str, str]:
1069
- """Reset everything to initial state - clear all data, CSV, and UI components."""
1070
- try:
1071
- # Clear the CSV file
1072
- if os.path.exists(CSV_FILE):
1073
- os.remove(CSV_FILE)
1074
- logger.info("CSV file deleted for reset")
1075
-
1076
- # Create empty dataframe
1077
- empty_df = pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
1078
-
1079
- # Reset state variables
1080
- repo_ids_reset = []
1081
- current_idx_reset = 0
1082
- user_requirements_reset = ""
1083
-
1084
- # Reset status
1085
- status_reset = "Status: Everything has been reset. Ready to start fresh!"
1086
-
1087
- # Reset UI components
1088
- current_requirements_reset = "No requirements extracted yet."
1089
- extracted_keywords_reset = ""
1090
-
1091
- # Reset chatbot to initial message
1092
- chatbot_reset = [{"role": "assistant", "content": CHATBOT_INITIAL_MESSAGE}]
1093
-
1094
- logger.info("Complete system reset performed")
1095
-
1096
- return (
1097
- repo_ids_reset, # repo_ids_state
1098
- current_idx_reset, # current_repo_idx_state
1099
- user_requirements_reset, # user_requirements_state
1100
- empty_df, # df_output
1101
- empty_df, # top_repos_df
1102
- gr.update(visible=False), # top_repos_section
1103
- gr.update(visible=False), # repo_action_modal
1104
- gr.update(visible=False), # text_expansion_modal
1105
- chatbot_reset, # chatbot
1106
- status_reset, # status_box_analysis
1107
- current_requirements_reset, # current_requirements_display
1108
- extracted_keywords_reset # extracted_keywords_output
1109
- )
1110
-
1111
- except Exception as e:
1112
- logger.error(f"Error during reset: {e}")
1113
- error_status = f"Reset failed: {e}"
1114
- return (
1115
- [], # repo_ids_state
1116
- 0, # current_repo_idx_state
1117
- "", # user_requirements_state
1118
- pd.DataFrame(), # df_output
1119
- pd.DataFrame(), # top_repos_df
1120
- gr.update(visible=False), # top_repos_section
1121
- gr.update(visible=False), # repo_action_modal
1122
- gr.update(visible=False), # text_expansion_modal
1123
- [{"role": "assistant", "content": CHATBOT_INITIAL_MESSAGE}], # chatbot
1124
- error_status, # status_box_analysis
1125
- "No requirements extracted yet.", # current_requirements_display
1126
- "" # extracted_keywords_output
1127
- )
1128
-
1129
- # --- Component Event Wiring ---
1130
-
1131
- # Initialize chatbot with welcome message on app load
1132
- app.load(
1133
- fn=lambda: [{"role": "assistant", "content": CHATBOT_INITIAL_MESSAGE}],
1134
- outputs=[chatbot]
1135
- )
1136
-
1137
- # Input Tab
1138
- submit_repo_btn.click(
1139
- fn=handle_repo_id_submission,
1140
- inputs=[repo_id_input],
1141
- outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_analysis, tabs]
1142
- )
1143
- search_btn.click(
1144
- fn=handle_keyword_search,
1145
- inputs=[keyword_input],
1146
- outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_analysis, tabs]
1147
- )
1148
-
1149
- # Analysis Tab
1150
- analyze_all_btn.click(
1151
- fn=lambda: None, # No need to show progress display since it's commented out
1152
- outputs=[]
1153
- ).then(
1154
- fn=handle_analyze_all_repos,
1155
- inputs=[repo_ids_state, user_requirements_state],
1156
- outputs=[df_output, status_box_analysis, top_repos_df, top_repos_section]
1157
- )
1158
-
1159
- # Chatbot Tab
1160
- msg_input.submit(
1161
- fn=handle_user_message,
1162
- inputs=[msg_input, chatbot],
1163
- outputs=[chatbot, msg_input]
1164
- ).then(
1165
- fn=handle_bot_response,
1166
- inputs=[chatbot],
1167
- outputs=[chatbot]
1168
- )
1169
- send_btn.click(
1170
- fn=handle_user_message,
1171
- inputs=[msg_input, chatbot],
1172
- outputs=[chatbot, msg_input]
1173
- ).then(
1174
- fn=handle_bot_response,
1175
- inputs=[chatbot],
1176
- outputs=[chatbot]
1177
- )
1178
- end_chat_btn.click(
1179
- fn=handle_end_chat,
1180
- inputs=[chatbot],
1181
- outputs=[extracted_keywords_output, status_box_chatbot, user_requirements_state]
1182
- ).then(
1183
- fn=lambda req: req if req.strip() else "No specific requirements extracted from conversation.",
1184
- inputs=[user_requirements_state],
1185
- outputs=[current_requirements_display]
1186
- )
1187
- use_keywords_btn.click(
1188
- fn=handle_keyword_search,
1189
- inputs=[extracted_keywords_output],
1190
- outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_analysis, tabs]
1191
- )
1192
-
1193
- # Repo Explorer Tab
1194
- setup_repo_explorer_events(repo_components, repo_states)
1195
-
1196
- # Modal button events
1197
- visit_repo_btn.click(
1198
- fn=handle_visit_repo,
1199
- inputs=[selected_repo_display],
1200
- outputs=[repo_action_modal, selected_repo_display],
1201
- js="(repo_id) => { if(repo_id && repo_id.trim()) { window.open('https://huggingface.co/spaces/' + repo_id.trim(), '_blank'); } }"
1202
- )
1203
- explore_repo_btn.click(
1204
- fn=handle_explore_repo,
1205
- inputs=[selected_repo_id_state],
1206
- outputs=[
1207
- repo_action_modal,
1208
- tabs,
1209
- repo_components["repo_explorer_input"]
1210
- ],
1211
- js="""(repo_id) => {
1212
- console.log('DEBUG: Navigate to repo explorer for:', repo_id);
1213
- setTimeout(() => {
1214
- window.scrollTo({top: 0, behavior: 'smooth'});
1215
- }, 200);
1216
- }"""
1217
- )
1218
- cancel_modal_btn.click(
1219
- fn=handle_cancel_modal,
1220
- outputs=[repo_action_modal]
1221
- )
1222
-
1223
- # Text expansion modal events
1224
- close_text_modal_btn.click(
1225
- fn=handle_close_text_modal,
1226
- outputs=[text_expansion_modal]
1227
- )
1228
-
1229
- # Add dataframe selection event
1230
- df_output.select(
1231
- fn=handle_dataframe_select,
1232
- inputs=[df_output],
1233
- outputs=[selected_repo_display, repo_action_modal, tabs, expanded_content_title, expanded_content_text, text_expansion_modal, selected_repo_id_state]
1234
- )
1235
-
1236
- # Add selection event for top repositories dataframe too
1237
- top_repos_df.select(
1238
- fn=handle_dataframe_select,
1239
- inputs=[top_repos_df],
1240
- outputs=[selected_repo_display, repo_action_modal, tabs, expanded_content_title, expanded_content_text, text_expansion_modal, selected_repo_id_state]
1241
- )
1242
-
1243
- # Reset button event
1244
- reset_all_btn.click(
1245
- fn=handle_reset_everything,
1246
- outputs=[repo_ids_state, current_repo_idx_state, user_requirements_state, df_output, top_repos_df, top_repos_section, repo_action_modal, text_expansion_modal, chatbot, status_box_analysis, current_requirements_display, extracted_keywords_output]
1247
- )
1248
-
1249
- return app
1250
-
1251
- if __name__ == "__main__":
1252
- app = create_ui()
1253
- app.launch(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
repo_explorer_old.py DELETED
@@ -1,200 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import logging
4
- from typing import List, Dict, Tuple
5
- from analyzer import combine_repo_files_for_llm, handle_load_repository
6
- from hf_utils import download_filtered_space_files
7
-
8
- # Setup logger
9
- logger = logging.getLogger(__name__)
10
-
11
- def create_repo_explorer_tab() -> Tuple[Dict[str, gr.components.Component], Dict[str, gr.State]]:
12
- """
13
- Creates the Repo Explorer tab content and returns the component references and state variables.
14
- """
15
-
16
- # State variables for repo explorer
17
- states = {
18
- "repo_context_summary": gr.State(""),
19
- "current_repo_id": gr.State("")
20
- }
21
-
22
- gr.Markdown("### πŸ—‚οΈ Deep Dive into a Specific Repository")
23
-
24
- with gr.Row():
25
- with gr.Column(scale=2):
26
- repo_explorer_input = gr.Textbox(
27
- label="πŸ“ Repository ID",
28
- placeholder="microsoft/DialoGPT-medium",
29
- info="Enter a Hugging Face repository ID to explore"
30
- )
31
- with gr.Column(scale=1):
32
- load_repo_btn = gr.Button("πŸš€ Load Repository", variant="primary", size="lg")
33
-
34
- with gr.Row():
35
- repo_status_display = gr.Textbox(
36
- label="πŸ“Š Repository Status",
37
- interactive=False,
38
- lines=3,
39
- info="Current repository loading status and basic info"
40
- )
41
-
42
- with gr.Row():
43
- with gr.Column(scale=2):
44
- repo_chatbot = gr.Chatbot(
45
- label="πŸ€– Repository Assistant",
46
- height=400,
47
- type="messages",
48
- avatar_images=(
49
- "https://cdn-icons-png.flaticon.com/512/149/149071.png",
50
- "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"
51
- ),
52
- show_copy_button=True,
53
- value=[] # Start empty - welcome message will appear only after repo is loaded
54
- )
55
-
56
- with gr.Row():
57
- repo_msg_input = gr.Textbox(
58
- label="πŸ’­ Ask about this repository",
59
- placeholder="What does this repository do? How do I use it?",
60
- lines=1,
61
- scale=4,
62
- info="Ask anything about the loaded repository"
63
- )
64
- repo_send_btn = gr.Button("πŸ“€ Send", variant="primary", scale=1)
65
-
66
- # with gr.Column(scale=1):
67
- # # Repository content preview
68
- # repo_content_display = gr.Textbox(
69
- # label="πŸ“„ Repository Content Preview",
70
- # lines=20,
71
- # show_copy_button=True,
72
- # interactive=False,
73
- # info="Overview of the loaded repository structure and content"
74
- # )
75
-
76
- # Component references
77
- components = {
78
- "repo_explorer_input": repo_explorer_input,
79
- "load_repo_btn": load_repo_btn,
80
- "repo_status_display": repo_status_display,
81
- "repo_chatbot": repo_chatbot,
82
- "repo_msg_input": repo_msg_input,
83
- "repo_send_btn": repo_send_btn,
84
- # "repo_content_display": repo_content_display
85
- }
86
-
87
- return components, states
88
-
89
- def handle_repo_user_message(user_message: str, history: List[Dict[str, str]], repo_context_summary: str, repo_id: str) -> Tuple[List[Dict[str, str]], str]:
90
- """Handle user messages in the repo-specific chatbot."""
91
- if not repo_context_summary.strip():
92
- return history, ""
93
-
94
- # Initialize with repository-specific welcome message if empty
95
- if not history:
96
- welcome_msg = f"Hello! I'm your assistant for the '{repo_id}' repository. I have analyzed all the files and created a comprehensive understanding of this repository. I'm ready to answer any questions about its functionality, usage, architecture, and more. What would you like to know?"
97
- history = [{"role": "assistant", "content": welcome_msg}]
98
-
99
- if user_message:
100
- history.append({"role": "user", "content": user_message})
101
- return history, ""
102
-
103
- def handle_repo_bot_response(history: List[Dict[str, str]], repo_context_summary: str, repo_id: str) -> List[Dict[str, str]]:
104
- """Generate bot response for repo-specific questions using comprehensive context."""
105
- if not history or history[-1]["role"] != "user" or not repo_context_summary.strip():
106
- return history
107
-
108
- user_message = history[-1]["content"]
109
-
110
- # Create a specialized prompt using the comprehensive context summary
111
- repo_system_prompt = f"""You are an expert assistant for the Hugging Face repository '{repo_id}'.
112
- You have comprehensive knowledge about this repository based on detailed analysis of all its files and components.
113
-
114
- Use the following comprehensive analysis to answer user questions accurately and helpfully:
115
-
116
- {repo_context_summary}
117
-
118
- Instructions:
119
- - Answer questions clearly and conversationally about this specific repository
120
- - Reference specific components, functions, or features when relevant
121
- - Provide practical guidance on installation, usage, and implementation
122
- - If asked about code details, refer to the analysis above
123
- - Be helpful and informative while staying focused on this repository
124
- - If something isn't covered in the analysis, acknowledge the limitation
125
-
126
- Answer the user's question based on your comprehensive knowledge of this repository."""
127
-
128
- try:
129
- from openai import OpenAI
130
- client = OpenAI(api_key=os.getenv("modal_api"))
131
- client.base_url = os.getenv("base_url")
132
-
133
- response = client.chat.completions.create(
134
- model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
135
- messages=[
136
- {"role": "system", "content": repo_system_prompt},
137
- {"role": "user", "content": user_message}
138
- ],
139
- max_tokens=1024,
140
- temperature=0.7
141
- )
142
-
143
- bot_response = response.choices[0].message.content
144
- history.append({"role": "assistant", "content": bot_response})
145
-
146
- except Exception as e:
147
- logger.error(f"Error generating repo bot response: {e}")
148
- error_response = f"I apologize, but I encountered an error while processing your question: {e}"
149
- history.append({"role": "assistant", "content": error_response})
150
-
151
- return history
152
-
153
- def initialize_repo_chatbot(repo_status: str, repo_id: str, repo_context_summary: str) -> List[Dict[str, str]]:
154
- """Initialize the repository chatbot with a welcome message after successful repo loading."""
155
- # Only initialize if repository was loaded successfully
156
- if repo_context_summary.strip() and "successfully" in repo_status.lower():
157
- welcome_msg = f"πŸ‘‹ Welcome! I've successfully analyzed the **{repo_id}** repository.\n\n🧠 **I now have comprehensive knowledge of:**\nβ€’ All files and code structure\nβ€’ Key features and capabilities\nβ€’ Installation and usage instructions\nβ€’ Architecture and implementation details\nβ€’ Dependencies and requirements\n\nπŸ’¬ **Ask me anything about this repository!** \nFor example:\nβ€’ \"What does this repository do?\"\nβ€’ \"How do I install and use it?\"\nβ€’ \"What are the main components?\"\nβ€’ \"Show me usage examples\"\n\nWhat would you like to know? πŸ€”"
158
- return [{"role": "assistant", "content": welcome_msg}]
159
- else:
160
- # Keep chatbot empty if loading failed
161
- return []
162
-
163
- def setup_repo_explorer_events(components: Dict[str, gr.components.Component], states: Dict[str, gr.State]):
164
- """Setup event handlers for the repo explorer components."""
165
-
166
- # Load repository event
167
- components["load_repo_btn"].click(
168
- fn=handle_load_repository,
169
- inputs=[components["repo_explorer_input"]],
170
- outputs=[components["repo_status_display"], states["repo_context_summary"]]
171
- ).then(
172
- fn=lambda repo_id: repo_id,
173
- inputs=[components["repo_explorer_input"]],
174
- outputs=[states["current_repo_id"]]
175
- ).then(
176
- fn=initialize_repo_chatbot,
177
- inputs=[components["repo_status_display"], states["current_repo_id"], states["repo_context_summary"]],
178
- outputs=[components["repo_chatbot"]]
179
- )
180
-
181
- # Chat message submission events
182
- components["repo_msg_input"].submit(
183
- fn=handle_repo_user_message,
184
- inputs=[components["repo_msg_input"], components["repo_chatbot"], states["repo_context_summary"], states["current_repo_id"]],
185
- outputs=[components["repo_chatbot"], components["repo_msg_input"]]
186
- ).then(
187
- fn=handle_repo_bot_response,
188
- inputs=[components["repo_chatbot"], states["repo_context_summary"], states["current_repo_id"]],
189
- outputs=[components["repo_chatbot"]]
190
- )
191
-
192
- components["repo_send_btn"].click(
193
- fn=handle_repo_user_message,
194
- inputs=[components["repo_msg_input"], components["repo_chatbot"], states["repo_context_summary"], states["current_repo_id"]],
195
- outputs=[components["repo_chatbot"], components["repo_msg_input"]]
196
- ).then(
197
- fn=handle_repo_bot_response,
198
- inputs=[components["repo_chatbot"], states["repo_context_summary"], states["current_repo_id"]],
199
- outputs=[components["repo_chatbot"]]
200
- )
test.py DELETED
@@ -1,23 +0,0 @@
1
- """This simple script shows how to interact with an OpenAI-compatible server from a client."""
2
-
3
- # import argparse
4
-
5
- # import modal
6
- from openai import OpenAI
7
- import os
8
-
9
- client = OpenAI(api_key=os.getenv("modal_api"))
10
- client.base_url = (
11
- "https://alexprincecursor--example-vllm-openai-compatible-serve.modal.run/v1/"
12
- )
13
-
14
- response = client.chat.completions.create(
15
- model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",  # quantized Llama 3.1 8B Instruct
16
- messages=[
17
- {"role": "system", "content": "You are a rockstar lyric generator. You are given a song and you need to generate a lyric for it."},
18
- {"role": "user", "content":"The song is 'Bohemian Rhapsody' by Queen."}
19
- ],
20
- max_tokens=512,
21
- temperature=0.7
22
- )
23
- print(response.choices[0].message.content)
test_vectorization.py DELETED
@@ -1,135 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Simple test script to verify vectorization functionality.
4
- Run this to check if sentence-transformers is working correctly.
5
- """
6
-
7
- import os
8
- import sys
9
-
10
- def test_vectorization():
11
- """Test the vectorization functionality."""
12
- print("πŸ§ͺ Testing vectorization functionality...")
13
-
14
- # Test 1: Import dependencies
15
- print("\n1. Testing imports...")
16
- try:
17
- import numpy as np
18
- print("βœ… numpy imported successfully")
19
- except ImportError as e:
20
- print(f"❌ numpy import failed: {e}")
21
- return False
22
-
23
- try:
24
- from sentence_transformers import SentenceTransformer
25
- print("βœ… sentence-transformers imported successfully")
26
- except ImportError as e:
27
- print(f"❌ sentence-transformers import failed: {e}")
28
- print("Install with: pip install sentence-transformers")
29
- return False
30
-
31
- # Test 2: Load model
32
- print("\n2. Testing model loading...")
33
- try:
34
- model = SentenceTransformer('all-MiniLM-L6-v2')
35
- print("βœ… SentenceTransformer model loaded successfully")
36
- except Exception as e:
37
- print(f"❌ Model loading failed: {e}")
38
- return False
39
-
40
- # Test 3: Create embeddings
41
- print("\n3. Testing embedding creation...")
42
- try:
43
- test_texts = [
44
- "This is a Python function for machine learning",
45
- "Here's a repository configuration file",
46
- "Installation instructions for the project"
47
- ]
48
- embeddings = model.encode(test_texts)
49
- print(f"βœ… Created embeddings with shape: {embeddings.shape}")
50
- except Exception as e:
51
- print(f"❌ Embedding creation failed: {e}")
52
- return False
53
-
54
- # Test 4: Test similarity calculation
55
- print("\n4. Testing similarity calculation...")
56
- try:
57
- query_embedding = model.encode(["Python code example"])
58
- similarities = []
59
- for embedding in embeddings:
60
- similarity = np.dot(query_embedding[0], embedding) / (
61
- np.linalg.norm(query_embedding[0]) * np.linalg.norm(embedding)
62
- )
63
- similarities.append(similarity)
64
- print(f"βœ… Similarity scores: {[f'{s:.3f}' for s in similarities]}")
65
- except Exception as e:
66
- print(f"❌ Similarity calculation failed: {e}")
67
- return False
68
-
69
- # Test 5: Test repo_explorer integration
70
- print("\n5. Testing repo_explorer integration...")
71
- try:
72
- from repo_explorer import SimpleVectorStore, vectorize_repository_content
73
-
74
- # Create test repository content
75
- test_repo_content = """# Test Repository
76
- import numpy as np
77
- import pandas as pd
78
-
79
- def main():
80
- print("Hello, world!")
81
-
82
- class DataProcessor:
83
- def __init__(self):
84
- self.data = []
85
-
86
- def process(self, data):
87
- return data.upper()
88
-
89
- if __name__ == "__main__":
90
- main()
91
- """
92
-
93
- # Test vectorization
94
- success = vectorize_repository_content(test_repo_content, "test/repo")
95
- if success:
96
- print("βœ… Repository vectorization successful")
97
-
98
- # Test vector store
99
- from repo_explorer import vector_store
100
- stats = vector_store.get_stats()
101
- print(f"βœ… Vector store stats: {stats}")
102
-
103
- # Test search
104
- results = vector_store.search("Python function", top_k=2)
105
- if results:
106
- print(f"βœ… Vector search returned {len(results)} results")
107
- for i, (chunk, similarity, metadata) in enumerate(results):
108
- print(f" Result {i+1}: similarity={similarity:.3f}")
109
- else:
110
- print("⚠️ Vector search returned no results")
111
- else:
112
- print("❌ Repository vectorization failed")
113
- return False
114
-
115
- except Exception as e:
116
- print(f"❌ repo_explorer integration test failed: {e}")
117
- return False
118
-
119
- print("\nπŸŽ‰ All tests passed! Vectorization is working correctly.")
120
- return True
121
-
122
- if __name__ == "__main__":
123
- print("Repository Explorer Vectorization Test")
124
- print("=" * 45)
125
-
126
- success = test_vectorization()
127
-
128
- if success:
129
- print("\nβœ… Ready to use vectorization in repo explorer!")
130
- print(" The sentence-transformers model will be downloaded on first use.")
131
- else:
132
- print("\n❌ Vectorization setup incomplete.")
133
- print(" Make sure to install: pip install sentence-transformers numpy")
134
-
135
- sys.exit(0 if success else 1)