Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import regex as re | |
| import csv | |
| import pandas as pd | |
| from typing import List, Dict, Tuple, Any | |
| import logging | |
| import os | |
| import time | |
| # Import core logic from other modules, as in app_old.py | |
| from analyzer import ( | |
| combine_repo_files_for_llm, | |
| parse_llm_json_response, | |
| analyze_combined_file, | |
| handle_load_repository | |
| ) | |
| from hf_utils import download_filtered_space_files, search_top_spaces | |
| from chatbot_page import chat_with_user, extract_keywords_from_conversation | |
| from repo_explorer import create_repo_explorer_tab, setup_repo_explorer_events, initialize_repo_chatbot | |
# --- Configuration ---
# Root logging configuration: timestamped INFO-level messages for the whole app.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Shared on-disk store: one row per repo ID plus its analysis columns.
CSV_FILE = "repo_ids.csv"
# System prompt that restricts the assistant to requirement-gathering only —
# it must never recommend repositories itself; the app searches automatically
# once the assistant signals it has enough information.
CHATBOT_SYSTEM_PROMPT = (
    "You are a helpful assistant whose ONLY job is to gather information about the user's ideal repository requirements. "
    "DO NOT suggest any specific repositories or give repository recommendations. "
    "Your role is to ask clarifying questions to understand exactly what the user is looking for. "
    "Ask about their use case, preferred programming language, specific features needed, project type, etc. "
    "When you feel you have gathered enough detailed information about their requirements, "
    "tell the user: 'I think I have enough information about your requirements. I'll now search for relevant repositories automatically.' "
    "Focus on understanding their needs, not providing solutions."
)
# Greeting shown as the assistant's first message when the chat tab opens.
CHATBOT_INITIAL_MESSAGE = "Hello! I'm here to help you find the perfect Hugging Face repository. Tell me about your project - what are you trying to build? I'll ask some questions to understand your needs and then automatically find relevant repositories for you."
| # --- Helper Functions (Logic) --- | |
def is_repo_id_format(text: str) -> bool:
    """Heuristically decide whether *text* lists repository IDs.

    The input is split on newlines and commas; when at least half of the
    non-empty entries contain a forward slash (the ``owner/name`` shape),
    the text is treated as repo IDs rather than search keywords.
    """
    entries = []
    for raw in re.split(r'[\n,]+', text):
        stripped = raw.strip()
        if stripped:
            entries.append(stripped)
    if not entries:
        return False
    with_slash = [entry for entry in entries if '/' in entry]
    # Majority-ish vote: >= 50% slash-bearing entries means "repo IDs".
    return len(with_slash) >= len(entries) * 0.5
def should_auto_extract_keywords(history: List[Dict[str, str]]) -> bool:
    """Return True when the conversation is ready for automatic keyword extraction.

    Requires at least four messages (two full exchanges) and a final
    assistant reply containing one of the readiness phrases the system
    prompt instructs the model to emit.
    """
    # Too short a conversation -> not enough signal yet.
    if not history or len(history) < 4:
        return False
    # Locate the most recent assistant reply (case-insensitive matching).
    last_reply = ""
    for entry in reversed(history):
        if entry.get('role') == 'assistant':
            last_reply = entry.get('content', '').lower()
            break
    trigger_phrases = (
        "enough information",
        "search for repositories",
        "find repositories",
        "look for repositories",
        "automatically",
        "ready to search"
    )
    for phrase in trigger_phrases:
        if phrase in last_reply:
            return True
    return False
def get_top_relevant_repos(df: pd.DataFrame, user_requirements: str, top_n: int = 3) -> pd.DataFrame:
    """
    Use an LLM to pick the `top_n` most relevant repositories from the analysis table.

    Args:
        df: Analysis table with string columns "repo id", "strength",
            "weaknesses", "speciality" and "relevance rating".
        user_requirements: Free-text requirements gathered from the chat;
            blank text falls back to a generic recommendation context.
        top_n: Number of repositories to return (default 3).

    Returns:
        Up to `top_n` rows ordered most-relevant first. Returns an empty
        frame (with the expected columns) when nothing is analyzable; on any
        LLM/parsing failure it degrades to the first `top_n` analyzed rows.
    """
    try:
        if df.empty:
            return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
        # Keep only rows that carry at least one non-blank analysis field,
        # i.e. repositories that have actually been analyzed.
        analyzed_df = df.copy()
        analyzed_df = analyzed_df[
            (analyzed_df['strength'].str.strip() != '') |
            (analyzed_df['weaknesses'].str.strip() != '') |
            (analyzed_df['speciality'].str.strip() != '') |
            (analyzed_df['relevance rating'].str.strip() != '')
        ]
        if analyzed_df.empty:
            logger.warning("No analyzed repositories found for LLM selection")
            return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
        # Flatten each analyzed row into a plain-text record for the prompt.
        csv_data = ""
        for idx, row in analyzed_df.iterrows():
            csv_data += f"Repository: {row['repo id']}\n"
            csv_data += f"Strengths: {row['strength']}\n"
            csv_data += f"Weaknesses: {row['weaknesses']}\n"
            csv_data += f"Speciality: {row['speciality']}\n"
            csv_data += f"Relevance: {row['relevance rating']}\n\n"
        user_context = user_requirements if user_requirements.strip() else "General repository recommendation"
        prompt = f"""Based on the user's requirements and the analysis of repositories below, select the top {top_n} most relevant repositories.
User Requirements:
{user_context}
Repository Analysis Data:
{csv_data}
Please analyze all repositories and select the {top_n} most relevant ones based on:
1. How well they match the user's specific requirements
2. Their strengths and capabilities
3. Their relevance rating
4. Their speciality alignment with user needs
Return ONLY a JSON list of the repository IDs in order of relevance (most relevant first). Example format:
["repo1", "repo2", "repo3"]
Selected repositories:"""
        try:
            # Lazy import so the module still loads when `openai` is absent.
            from openai import OpenAI
            # NOTE(review): credentials come from the `modal_api` env var and the
            # endpoint from `base_url` — presumably an OpenAI-compatible hosted
            # server; confirm these env-var names against the deployment config.
            client = OpenAI(api_key=os.getenv("modal_api"))
            client.base_url = os.getenv("base_url")
            response = client.chat.completions.create(
                model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
                messages=[
                    {"role": "system", "content": "You are an expert at analyzing and ranking repositories based on user requirements. Always return valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=200,
                temperature=0.3
            )
            llm_response = response.choices[0].message.content.strip()
            logger.info(f"LLM response for top repos: {llm_response}")
            # Extract JSON from response
            import json
            import re  # local import shadows the module-level `regex as re` in this scope
            # Greedy match grabs the outermost [...] span, so nested brackets survive.
            json_match = re.search(r'\[.*\]', llm_response)
            if json_match:
                selected_repos = json.loads(json_match.group())
                logger.info(f"LLM selected repositories: {selected_repos}")
                # Re-order the dataframe rows to follow the LLM's ranking;
                # IDs the LLM invented (no matching row) are silently skipped.
                top_repos_list = []
                for repo_id in selected_repos[:top_n]:
                    matching_rows = analyzed_df[analyzed_df['repo id'] == repo_id]
                    if not matching_rows.empty:
                        top_repos_list.append(matching_rows.iloc[0])
                if top_repos_list:
                    top_repos = pd.DataFrame(top_repos_list)
                    logger.info(f"Successfully selected {len(top_repos)} repositories using LLM")
                    return top_repos
            # Fallback: if LLM response parsing fails, use first N analyzed repos
            logger.warning("Failed to parse LLM response, using fallback selection")
            return analyzed_df.head(top_n)
        except Exception as llm_error:
            logger.error(f"LLM selection failed: {llm_error}")
            # Fallback: return first N repositories with analysis data
            return analyzed_df.head(top_n)
    except Exception as e:
        logger.error(f"Error in LLM-based repo selection: {e}")
        return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
def write_repos_to_csv(repo_ids: List[str]) -> None:
    """Overwrite the tracking CSV with a header plus one empty-analysis row per repo ID."""
    header = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    rows = [[rid, "", "", "", ""] for rid in repo_ids]
    try:
        with open(CSV_FILE, mode="w", newline='', encoding="utf-8") as csvfile:
            csv.writer(csvfile).writerows([header] + rows)
        logger.info(f"Wrote {len(repo_ids)} repo IDs to {CSV_FILE}")
    except Exception as e:
        logger.error(f"Error writing to CSV: {e}")
def format_text_for_dataframe(text: str, max_length: int = 200) -> str:
    """Clean *text* for compact dataframe display.

    Collapses all runs of whitespace (including newlines) to single spaces
    and truncates to ``max_length`` characters, appending "..." when cut.
    Empty/NaN input yields "".
    """
    if not text or pd.isna(text):
        return ""
    # Normalize: strip ends, then squash internal whitespace runs.
    cleaned = re.sub(r'\s+', ' ', str(text).strip())
    if len(cleaned) <= max_length:
        return cleaned
    # Reserve three characters for the ellipsis so the result stays at max_length.
    return cleaned[:max_length - 3] + "..."
def read_csv_to_dataframe() -> pd.DataFrame:
    """Load the tracking CSV as an all-string DataFrame (NaN -> "").

    Full cell text is preserved; any truncation happens in the UI layer.
    A missing file yields an empty frame with the expected columns; any
    other read failure yields a bare empty frame.
    """
    try:
        return pd.read_csv(CSV_FILE, dtype=str).fillna('')
    except FileNotFoundError:
        # First run: no CSV written yet — behave as if it were empty.
        return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
    except Exception as e:
        logger.error(f"Error reading CSV: {e}")
        return pd.DataFrame()
def format_dataframe_for_display(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* prepared for display — full text, no truncation.

    Non-empty frames are copied so callers cannot mutate the original;
    an empty frame is returned as-is.
    """
    return df if df.empty else df.copy()
def analyze_and_update_single_repo(repo_id: str, user_requirements: str = "") -> Tuple[str, str, pd.DataFrame]:
    """
    Downloads, analyzes a single repo, updates the CSV, and returns results.

    Args:
        repo_id: Hugging Face repo/Space ID ("owner/name") to analyze.
        user_requirements: Optional requirements text passed to the analyzer
            so the relevance rating reflects the user's stated needs.

    Returns:
        Tuple of (combined repo file text, human-readable summary, updated
        analysis DataFrame). On failure, content is "" and the summary
        carries the error message.
    """
    try:
        logger.info(f"Starting analysis for repo: {repo_id}")
        # Fetch only text-like files — everything else is useless to the LLM.
        download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=['.py', '.md', '.txt'])
        txt_path = combine_repo_files_for_llm()
        with open(txt_path, "r", encoding="utf-8") as f:
            combined_content = f.read()
        llm_output = analyze_combined_file(txt_path, user_requirements)
        # Take the LAST {...} span: the model may emit prose before the final
        # JSON object, so rfind anchors on the trailing object.
        last_start = llm_output.rfind('{')
        last_end = llm_output.rfind('}')
        final_json_str = llm_output[last_start:last_end+1] if last_start != -1 and last_end != -1 else "{}"
        llm_json = parse_llm_json_response(final_json_str)
        summary = ""
        if isinstance(llm_json, dict) and "error" not in llm_json:
            strengths = llm_json.get("strength", "N/A")
            weaknesses = llm_json.get("weaknesses", "N/A")
            relevance = llm_json.get("relevance rating", "N/A")
            summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}\n\nRelevance: {relevance}"
        else:
            summary = f"JSON extraction: FAILED\nRaw response might not be valid JSON."
        # Update CSV
        df = read_csv_to_dataframe()
        repo_found_in_df = False
        # Linear scan is fine: the CSV holds at most a handful of repos.
        for idx, row in df.iterrows():
            if row["repo id"] == repo_id:
                if isinstance(llm_json, dict):
                    df.at[idx, "strength"] = llm_json.get("strength", "")
                    df.at[idx, "weaknesses"] = llm_json.get("weaknesses", "")
                    df.at[idx, "speciality"] = llm_json.get("speciality", "")
                    df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
                repo_found_in_df = True
                break
        if not repo_found_in_df:
            logger.warning(f"Repo ID {repo_id} not found in CSV for updating.")
        # Write CSV with better error handling and flushing
        try:
            df.to_csv(CSV_FILE, index=False)
            # Force file system flush (os.sync is POSIX-only; no-op elsewhere)
            os.sync() if hasattr(os, 'sync') else None
            logger.info(f"Successfully updated CSV for {repo_id}")
        except Exception as csv_error:
            logger.error(f"Failed to write CSV for {repo_id}: {csv_error}")
            # Try once more with a small delay
            time.sleep(0.2)
            try:
                df.to_csv(CSV_FILE, index=False)
                logger.info(f"Successfully updated CSV for {repo_id} on retry")
            except Exception as retry_error:
                logger.error(f"Failed to write CSV for {repo_id} on retry: {retry_error}")
        logger.info(f"Successfully analyzed and updated CSV for {repo_id}")
        return combined_content, summary, df
    except Exception as e:
        logger.error(f"An error occurred during analysis of {repo_id}: {e}")
        error_summary = f"Error analyzing repo: {e}"
        return "", error_summary, format_dataframe_for_display(read_csv_to_dataframe())
| # --- NEW: Helper for Chat History Conversion --- | |
def convert_messages_to_tuples(history: List[Dict[str, str]]) -> List[Tuple[str, str]]:
    """
    Convert Gradio's 'messages' history to the legacy (user, assistant) tuple format.

    Only user messages immediately followed by an assistant message produce a
    pair; leading assistant messages and trailing unanswered user messages are
    dropped, so a history that starts with an assistant greeting is handled.
    """
    pairs: List[Tuple[str, str]] = []
    # Walk adjacent message pairs; a (user, assistant) adjacency yields a tuple.
    for idx in range(len(history) - 1):
        current = history[idx]
        following = history[idx + 1]
        if current['role'] == 'user' and following['role'] == 'assistant':
            pairs.append((current['content'], following['content']))
    return pairs
| # --- Gradio UI --- | |
| def create_ui() -> gr.Blocks: | |
| """Creates and configures the entire Gradio interface.""" | |
| css = """ | |
| /* Modern sleek design */ | |
| .gradio-container { | |
| font-family: 'Inter', 'system-ui', sans-serif; | |
| background: linear-gradient(135deg, #0a0a0a 0%, #1a1a1a 100%); | |
| min-height: 100vh; | |
| } | |
| .gr-form { | |
| background: rgba(255, 255, 255, 0.95); | |
| backdrop-filter: blur(10px); | |
| border-radius: 16px; | |
| box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1); | |
| padding: 24px; | |
| margin: 16px; | |
| border: 1px solid rgba(255, 255, 255, 0.2); | |
| } | |
| .gr-button { | |
| background: linear-gradient(45deg, #667eea, #764ba2); | |
| border: none; | |
| border-radius: 12px; | |
| color: white; | |
| font-weight: 600; | |
| padding: 12px 24px; | |
| transition: all 0.3s ease; | |
| box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4); | |
| } | |
| .gr-button:hover { | |
| transform: translateY(-2px); | |
| box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6); | |
| } | |
| .gr-textbox { | |
| border: 2px solid rgba(102, 126, 234, 0.2); | |
| border-radius: 12px; | |
| background: rgba(255, 255, 255, 0.9); | |
| transition: all 0.3s ease; | |
| } | |
| .gr-textbox:focus { | |
| border-color: #667eea; | |
| box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1); | |
| } | |
| .gr-panel { | |
| background: rgba(255, 255, 255, 0.95); | |
| border-radius: 16px; | |
| box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1); | |
| border: 1px solid rgba(255, 255, 255, 0.2); | |
| } | |
| .gr-tab-nav { | |
| background: rgba(255, 255, 255, 0.95); | |
| border-radius: 12px 12px 0 0; | |
| backdrop-filter: blur(10px); | |
| } | |
| .gr-tab-nav button { | |
| background: transparent; | |
| border: none; | |
| padding: 16px 24px; | |
| font-weight: 600; | |
| color: #666; | |
| transition: all 0.3s ease; | |
| } | |
| .gr-tab-nav button.selected { | |
| background: linear-gradient(45deg, #667eea, #764ba2); | |
| color: white; | |
| border-radius: 8px; | |
| } | |
| .chatbot { | |
| border-radius: 16px; | |
| box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1); | |
| } | |
| /* Hide Gradio footer */ | |
| footer { | |
| display: none !important; | |
| } | |
| /* Custom scrollbar */ | |
| ::-webkit-scrollbar { | |
| width: 8px; | |
| } | |
| ::-webkit-scrollbar-track { | |
| background: rgba(255, 255, 255, 0.1); | |
| border-radius: 4px; | |
| } | |
| ::-webkit-scrollbar-thumb { | |
| background: linear-gradient(45deg, #667eea, #764ba2); | |
| border-radius: 4px; | |
| } | |
| /* Improved dataframe styling for full text display */ | |
| .gr-dataframe { | |
| border-radius: 12px; | |
| overflow: hidden; | |
| box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1); | |
| background: rgba(255, 255, 255, 0.98); | |
| } | |
| .gr-dataframe table { | |
| width: 100%; | |
| table-layout: fixed; | |
| border-collapse: collapse; | |
| } | |
| /* Column width specifications for both dataframes */ | |
| .gr-dataframe th, | |
| .gr-dataframe td { | |
| padding: 12px 16px; | |
| text-align: left; | |
| border-bottom: 1px solid rgba(0, 0, 0, 0.1); | |
| font-size: 0.95rem; | |
| line-height: 1.4; | |
| } | |
| /* Specific column widths - applying to both dataframes */ | |
| .gr-dataframe th:nth-child(1), | |
| .gr-dataframe td:nth-child(1) { width: 16.67% !important; min-width: 16.67% !important; max-width: 16.67% !important; } | |
| .gr-dataframe th:nth-child(2), | |
| .gr-dataframe td:nth-child(2) { width: 25% !important; min-width: 25% !important; max-width: 25% !important; } | |
| .gr-dataframe th:nth-child(3), | |
| .gr-dataframe td:nth-child(3) { width: 25% !important; min-width: 25% !important; max-width: 25% !important; } | |
| .gr-dataframe th:nth-child(4), | |
| .gr-dataframe td:nth-child(4) { width: 20.83% !important; min-width: 20.83% !important; max-width: 20.83% !important; } | |
| .gr-dataframe th:nth-child(5), | |
| .gr-dataframe td:nth-child(5) { width: 12.5% !important; min-width: 12.5% !important; max-width: 12.5% !important; } | |
| /* Additional specific targeting for both dataframes */ | |
| div[data-testid="dataframe"] table th:nth-child(1), | |
| div[data-testid="dataframe"] table td:nth-child(1) { width: 16.67% !important; } | |
| div[data-testid="dataframe"] table th:nth-child(2), | |
| div[data-testid="dataframe"] table td:nth-child(2) { width: 25% !important; } | |
| div[data-testid="dataframe"] table th:nth-child(3), | |
| div[data-testid="dataframe"] table td:nth-child(3) { width: 25% !important; } | |
| div[data-testid="dataframe"] table th:nth-child(4), | |
| div[data-testid="dataframe"] table td:nth-child(4) { width: 20.83% !important; } | |
| div[data-testid="dataframe"] table th:nth-child(5), | |
| div[data-testid="dataframe"] table td:nth-child(5) { width: 12.5% !important; } | |
| /* Make repository names clickable */ | |
| .gr-dataframe td:nth-child(1) { | |
| cursor: pointer; | |
| color: #667eea; | |
| font-weight: 600; | |
| transition: all 0.3s ease; | |
| } | |
| .gr-dataframe td:nth-child(1):hover { | |
| background-color: rgba(102, 126, 234, 0.1); | |
| color: #764ba2; | |
| transform: scale(1.02); | |
| } | |
| /* Content columns - readable styling with scroll for long text */ | |
| .gr-dataframe td:nth-child(2), | |
| .gr-dataframe td:nth-child(3), | |
| .gr-dataframe td:nth-child(4), | |
| .gr-dataframe td:nth-child(5) { | |
| cursor: default; | |
| font-size: 0.9rem; | |
| } | |
| .gr-dataframe tbody tr:hover { | |
| background-color: rgba(102, 126, 234, 0.05); | |
| } | |
| /* JavaScript for auto-scroll to top on tab change */ | |
| <script> | |
| document.addEventListener('DOMContentLoaded', function() { | |
| // Function to scroll to top | |
| function scrollToTop() { | |
| window.scrollTo({ | |
| top: 0, | |
| behavior: 'smooth' | |
| }); | |
| } | |
| // Observer for tab changes | |
| const observer = new MutationObserver(function(mutations) { | |
| mutations.forEach(function(mutation) { | |
| if (mutation.type === 'attributes' && mutation.attributeName === 'class') { | |
| const target = mutation.target; | |
| if (target.classList && target.classList.contains('selected')) { | |
| // Tab was selected, scroll to top | |
| setTimeout(scrollToTop, 100); | |
| } | |
| } | |
| }); | |
| }); | |
| // Observe tab navigation buttons | |
| const tabButtons = document.querySelectorAll('.gr-tab-nav button'); | |
| tabButtons.forEach(button => { | |
| observer.observe(button, { attributes: true }); | |
| // Also add click listener for immediate scroll | |
| button.addEventListener('click', function() { | |
| setTimeout(scrollToTop, 150); | |
| }); | |
| }); | |
| // Enhanced listener for programmatic tab changes (button-triggered navigation) | |
| let lastSelectedTab = null; | |
| const checkInterval = setInterval(function() { | |
| const currentSelectedTab = document.querySelector('.gr-tab-nav button.selected'); | |
| if (currentSelectedTab && currentSelectedTab !== lastSelectedTab) { | |
| lastSelectedTab = currentSelectedTab; | |
| setTimeout(scrollToTop, 100); | |
| } | |
| }, 100); | |
| // Additional scroll trigger for repo explorer navigation | |
| window.addEventListener('repoExplorerNavigation', function() { | |
| setTimeout(scrollToTop, 200); | |
| }); | |
| // Watch for specific tab transitions to repo explorer | |
| const repoExplorerObserver = new MutationObserver(function(mutations) { | |
| mutations.forEach(function(mutation) { | |
| if (mutation.type === 'attributes' && mutation.attributeName === 'class') { | |
| const target = mutation.target; | |
| if (target.textContent && target.textContent.includes('🔍 Repo Explorer') && target.classList.contains('selected')) { | |
| setTimeout(scrollToTop, 150); | |
| } | |
| } | |
| }); | |
| }); | |
| // Start observing for repo explorer specific changes | |
| setTimeout(function() { | |
| const repoExplorerTab = Array.from(document.querySelectorAll('.gr-tab-nav button')).find(btn => | |
| btn.textContent && btn.textContent.includes('🔍 Repo Explorer') | |
| ); | |
| if (repoExplorerTab) { | |
| repoExplorerObserver.observe(repoExplorerTab, { attributes: true }); | |
| } | |
| }, 1000); | |
| }); | |
| </script> | |
| """ | |
| with gr.Blocks( | |
| theme=gr.themes.Soft( | |
| primary_hue="blue", | |
| secondary_hue="purple", | |
| neutral_hue="gray", | |
| font=["Inter", "system-ui", "sans-serif"] | |
| ), | |
| css=css, | |
| title="🚀 HF Repo Analyzer" | |
| ) as app: | |
| # --- State Management --- | |
| # Using simple, separate state objects for robustness. | |
| repo_ids_state = gr.State([]) | |
| current_repo_idx_state = gr.State(0) | |
| user_requirements_state = gr.State("") # Store user requirements from chatbot | |
| loaded_repo_content_state = gr.State("") # Store loaded repository content | |
| current_repo_id_state = gr.State("") # Store current repository ID | |
| selected_repo_id_state = gr.State("") # Store selected repository ID for modal actions | |
| gr.Markdown( | |
| """ | |
| <div style="text-align: center; padding: 40px 20px; background: rgba(255, 255, 255, 0.1); border-radius: 20px; margin: 20px auto; max-width: 900px; backdrop-filter: blur(10px);"> | |
| <h1 style="font-size: 3.5rem; font-weight: 800; margin: 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text;"> | |
| 🚀 HF Repo Analyzer | |
| </h1> | |
| <p style="font-size: 1.3rem; color: rgba(255, 255, 255, 0.9); margin: 16px 0 0 0; font-weight: 400; line-height: 1.6;"> | |
| Discover, analyze, and evaluate Hugging Face repositories with AI-powered insights | |
| </p> | |
| <div style="height: 4px; width: 80px; background: linear-gradient(45deg, #667eea, #764ba2); margin: 24px auto; border-radius: 2px;"></div> | |
| </div> | |
| """ | |
| ) | |
| # Global Reset and Help Buttons - visible on all tabs | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| pass | |
| with gr.Column(scale=2): | |
| with gr.Row(): | |
| help_btn = gr.Button("❓ Help", variant="secondary", size="lg", scale=1) | |
| reset_all_btn = gr.Button("🔄 Reset Everything", variant="stop", size="lg", scale=1) | |
| with gr.Column(scale=1): | |
| pass | |
| # Help Modal - visible when help button is clicked | |
| with gr.Row(): | |
| with gr.Column(): | |
| help_modal = gr.Column(visible=False) | |
| with help_modal: | |
| gr.Markdown( | |
| """ | |
| <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 16px; text-align: center; margin-bottom: 20px;"> | |
| <h2 style="color: white; margin: 0; font-size: 2rem;">📚 How to Use HF Repo Analyzer</h2> | |
| <p style="color: rgba(255,255,255,0.9); margin: 10px 0 0 0;">Step-by-step guide to find and analyze repositories</p> | |
| </div> | |
| """ | |
| ) | |
| with gr.Accordion("🚀 Method 1: AI Assistant (Recommended)", open=True): | |
| gr.Markdown( | |
| """ | |
| ### **Step 1: Start Conversation** | |
| - Go to the **🤖 AI Assistant** tab | |
| - Describe your project: *"I'm building a sentiment analysis tool"* | |
| - The AI will ask clarifying questions about your needs | |
| ### **Step 2: Let AI Work Its Magic** | |
| - Answer the AI's questions about your requirements | |
| - When ready, the AI will automatically: | |
| - Extract keywords from your conversation | |
| - Search for matching repositories | |
| - Analyze and rank them by relevance | |
| ### **Step 3: Review Results** | |
| - Interface automatically switches to **🔬 Analysis & Results** | |
| - View **Top 3** most relevant repositories | |
| - Browse detailed analysis with strengths/weaknesses | |
| - Click repository names to visit or explore them | |
| **💡 Tip**: This method gives the best personalized results! | |
| """ | |
| ) | |
| with gr.Accordion("📝 Method 2: Smart Search (Direct Input)", open=False): | |
| gr.Markdown( | |
| """ | |
| ### **Step 1: Choose Input Type** | |
| Go to **📝 Smart Search** tab and enter either: | |
| **Repository IDs** (with `/`): | |
| ``` | |
| microsoft/DialoGPT-medium | |
| openai/whisper | |
| huggingface/transformers | |
| ``` | |
| **Keywords** (no `/`): | |
| ``` | |
| text generation | |
| image classification | |
| sentiment analysis | |
| ``` | |
| ### **Step 2: Auto-Detection & Processing** | |
| - System automatically detects input type | |
| - Repository IDs → Direct analysis | |
| - Keywords → Search + analysis | |
| - Enable **🚀 Auto-analyze** for instant results | |
| ### **Step 3: Get Results** | |
| - Click **🔍 Find & Process Repositories** | |
| - View results in **🔬 Analysis & Results** tab | |
| """ | |
| ) | |
| with gr.Accordion("🔬 Understanding Analysis Results", open=False): | |
| gr.Markdown( | |
| """ | |
| ### **🏆 Top 3 Repositories** | |
| - AI-selected most relevant for your needs | |
| - Ranked by requirement matching and quality | |
| ### **📊 Detailed Analysis Table** | |
| - **Repository**: Click names to visit/explore | |
| - **Strengths**: Key capabilities and advantages | |
| - **Weaknesses**: Limitations and considerations | |
| - **Speciality**: Primary use case and domain | |
| - **Relevance**: How well it matches your needs | |
| ### **🔗 Quick Actions** | |
| Click repository names to: | |
| - **🌐 Visit Hugging Face Space**: See live demo | |
| - **🔍 Open in Repo Explorer**: Deep dive analysis | |
| """ | |
| ) | |
| with gr.Accordion("🔍 Repository Explorer Deep Dive", open=False): | |
| gr.Markdown( | |
| """ | |
| ### **Access Repository Explorer** | |
| - Click **🔍 Open in Repo Explorer** from results | |
| - Or manually enter repo ID in **🔍 Repo Explorer** tab | |
| ### **Features Available** | |
| - **Auto-loading**: Repository content analysis | |
| - **AI Chat**: Ask questions about the code | |
| - **File Exploration**: Browse repository structure | |
| - **Code Analysis**: Get explanations and insights | |
| ### **Sample Questions to Ask** | |
| - *"How do I use this repository?"* | |
| - *"What are the main functions?"* | |
| - *"Show me example usage"* | |
| - *"Explain the architecture"* | |
| """ | |
| ) | |
| with gr.Accordion("🎯 Pro Tips & Best Practices", open=False): | |
| gr.Markdown( | |
| """ | |
| ### **🤖 Getting Better AI Results** | |
| - Be specific about your use case | |
| - Mention programming language preferences | |
| - Describe your experience level | |
| - Include performance requirements | |
| ### **🔍 Search Optimization** | |
| - Use multiple relevant keywords | |
| - Try different keyword combinations | |
| - Check both general and specific terms | |
| ### **📊 Analyzing Results** | |
| - Read both strengths AND weaknesses | |
| - Check speciality alignment with your needs | |
| - Use Repository Explorer for detailed investigation | |
| - Compare multiple options before deciding | |
| ### **🔄 Workflow Tips** | |
| - Start with AI Assistant for personalized results | |
| - Use Smart Search for known repositories | |
| - Explore multiple repositories before choosing | |
| - Save interesting repositories for later comparison | |
| """ | |
| ) | |
| with gr.Accordion("⚠️ Important Notice: Server Startup Times", open=True): | |
| gr.Markdown( | |
| """ | |
| <div style="background: linear-gradient(135deg, #ff9a56 0%, #ff6b6b 100%); padding: 15px; border-radius: 12px; margin: 10px 0;"> | |
| <h3 style="color: white; margin: 0 0 10px 0; font-size: 1.3rem;">🕐 Model Response Times</h3> | |
| <p style="color: white; margin: 0; font-size: 1rem; line-height: 1.5;"> | |
| <strong>If the AI model takes longer than 5 minutes to respond:</strong><br/> | |
| 📡 The servers are starting up from sleep mode<br/> | |
| ⏳ This happens when the service hasn't been used recently<br/> | |
| 🚀 Once live, responses will be fast and smooth<br/> | |
| 💝 Thank you for your patience! | |
| </p> | |
| </div> | |
| ### **What to Expect** | |
| - **First request**: May take 3-7 minutes (server startup) | |
| - **Subsequent requests**: Fast responses (10-30 seconds) | |
| - **If timeout occurs**: Simply retry your request | |
| ### **Best Practices During Startup** | |
| - Start with a simple conversation or small repository list | |
| - Avoid analyzing many repositories simultaneously on first use | |
| - Once the first response comes through, normal speed resumes | |
| """ | |
| ) | |
| with gr.Row(): | |
| close_help_btn = gr.Button("✅ Got It, Let's Start!", variant="primary", size="lg") | |
| with gr.Tabs() as tabs: | |
| # --- AI Assistant Tab (moved to first) --- | |
| with gr.TabItem("🤖 AI Assistant", id="chatbot_tab"): | |
| gr.Markdown("### 💬 Intelligent Repository Discovery Assistant") | |
| gr.Markdown("🎯 **Tell me what you're building, and I'll automatically find the best repositories for you!**") | |
| chatbot = gr.Chatbot( | |
| label="🤖 AI Assistant", | |
| height=500, | |
| type="messages", | |
| avatar_images=( | |
| "https://cdn-icons-png.flaticon.com/512/149/149071.png", | |
| "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png" | |
| ), | |
| show_copy_button=True | |
| ) | |
| with gr.Row(): | |
| msg_input = gr.Textbox( | |
| label="💭 Your Message", | |
| placeholder="Tell me about your project...", | |
| lines=1, | |
| scale=5, | |
| info="Describe what you're building and I'll find the perfect repositories" | |
| ) | |
| send_btn = gr.Button("📤", variant="primary", scale=1) | |
| with gr.Row(): | |
| extract_analyze_btn = gr.Button("🎯 Extract Keywords & Analyze Now", variant="secondary", size="lg") | |
| # Status and extracted info (auto-updated, no manual buttons needed) | |
| with gr.Row(): | |
| with gr.Column(): | |
| chat_status = gr.Textbox( | |
| label="🎯 Chat Status", | |
| interactive=False, | |
| lines=2, | |
| info="Conversation progress and auto-actions" | |
| ) | |
| # with gr.Column(): | |
| # extracted_keywords_output = gr.Textbox( | |
| # label="🏷️ Auto-Extracted Keywords", | |
| # interactive=False, | |
| # show_copy_button=True, | |
| # info="Keywords automatically extracted and used for search" | |
| # ) | |
# --- Smart Search Tab (moved to second) ---
with gr.TabItem("📝 Smart Search", id="input_tab"):
    gr.Markdown("### 🔍 Intelligent Repository Discovery")
    gr.Markdown("💡 **Enter repository IDs (owner/repo) or keywords - I'll automatically detect which type and process accordingly!**")
    with gr.Row():
        # Single input for both repo IDs and keywords; handle_smart_input
        # decides which kind it received.
        smart_input = gr.Textbox(
            label="Repository IDs or Keywords",
            lines=6,
            placeholder="Examples:\n• Repository IDs: microsoft/DialoGPT-medium, openai/whisper\n• Keywords: text generation, image classification, sentiment analysis",
            info="Smart detection: Use / for repo IDs, or enter keywords for search"
        )
    with gr.Row():
        # When checked, analysis starts as soon as repositories are found.
        auto_analyze_checkbox = gr.Checkbox(
            label="🚀 Auto-analyze repositories",
            value=True,
            info="Automatically start analysis when repositories are found"
        )
    smart_submit_btn = gr.Button("🔍 Find & Process Repositories", variant="primary", size="lg", scale=1)
    status_box_input = gr.Textbox(label="📊 Status", interactive=False, lines=2)
# --- Analysis & Results Tab (moved to third) ---
with gr.TabItem("🔬 Analysis & Results", id="analysis_tab"):
    gr.Markdown("### 🧪 Repository Analysis Results")
    # Display current user requirements
    with gr.Row():
        current_requirements_display = gr.Textbox(
            label="📋 Active Requirements Context",
            interactive=False,
            lines=2,
            info="Requirements from AI chat for better relevance scoring"
        )
    # Manual analysis trigger (hidden by default, shown only when auto-analyze is off)
    with gr.Row(visible=False) as manual_analysis_row:
        analyze_all_btn = gr.Button("🚀 Analyze All Repositories", variant="primary", size="lg")
    status_box_analysis = gr.Textbox(label="📈 Analysis Status", interactive=False, lines=2)
    # Progress bar for batch analysis
    analysis_progress = gr.Progress()
    gr.Markdown("### 📊 Results Dashboard")
    # Top 3 Most Relevant Repositories (initially hidden)
    with gr.Column(visible=False) as top_repos_section:
        gr.Markdown("### 🏆 Top 3 Most Relevant Repositories")
        gr.Markdown("🎯 **Click repository names to visit them directly on Hugging Face:**")
        top_repos_df = gr.Dataframe(
            headers=["Repository", "Strengths", "Weaknesses", "Speciality", "Relevance"],
            column_widths=["16.67%", "25%", "25%", "20.83%", "12.5%"],
            wrap=True,
            interactive=False
        )
        # Quick links for top repositories
        with gr.Row():
            top_repo_links = gr.HTML(
                value="",
                label="🔗 Quick Links",
                visible=False
            )
    # Modal popup for repository action selection (positioned between the two CSV files)
    with gr.Row():
        with gr.Column():
            # Hidden column acting as a modal; toggled by handle_dataframe_select.
            repo_action_modal = gr.Column(visible=False)
            with repo_action_modal:
                gr.Markdown("### 🔗 Repository Actions")
                selected_repo_display = gr.Textbox(
                    label="Selected Repository",
                    interactive=False,
                    info="Choose what you'd like to do with this repository"
                )
                with gr.Row():
                    visit_repo_btn = gr.Button("🌐 Visit Hugging Face Space", variant="primary", size="lg")
                    explore_repo_btn = gr.Button("🔍 Open in Repo Explorer", variant="secondary", size="lg")
                    cancel_modal_btn = gr.Button("❌ Cancel", size="lg")
    gr.Markdown("### 📋 All Analysis Results")
    gr.Markdown("💡 **Click repository names to visit them on Hugging Face**")
    df_output = gr.Dataframe(
        headers=["Repository", "Strengths", "Weaknesses", "Speciality", "Relevance"],
        column_widths=["16.67%", "25%", "25%", "20.83%", "12.5%"],
        wrap=True,
        interactive=False
    )
    # Quick links section for all repositories
    with gr.Row():
        all_repo_links = gr.HTML(
            value="",
            label="🔗 Repository Quick Links"
        )
# --- Repo Explorer Tab (moved to fourth) ---
with gr.TabItem("🔍 Repo Explorer", id="repo_explorer_tab"):
    # Components and state dicts come from the repo_explorer module.
    repo_components, repo_states = create_repo_explorer_tab()
# --- Footer ---
# Static branding footer rendered below all tabs.
gr.Markdown(
    """
    <div style="text-align: center; padding: 30px 20px; margin-top: 40px; background: rgba(255, 255, 255, 0.1); border-radius: 16px; backdrop-filter: blur(10px);">
        <p style="margin: 0; color: rgba(255, 255, 255, 0.8); font-size: 0.95rem; font-weight: 500;">
            🚀 Powered by <span style="background: linear-gradient(45deg, #667eea, #764ba2); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 700;">Gradio</span>
            & <span style="background: linear-gradient(45deg, #667eea, #764ba2); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: 700;">Hugging Face</span>
        </p>
        <div style="height: 2px; width: 60px; background: linear-gradient(45deg, #667eea, #764ba2); margin: 16px auto; border-radius: 1px;"></div>
    </div>
    """
)
| # --- Event Handler Functions --- | |
def handle_smart_input(text: str, auto_analyze: bool) -> Tuple[List[str], int, pd.DataFrame, str, Any, str]:
    """Smart input handler that detects if input is repo IDs or keywords and processes accordingly.

    Args:
        text: Raw textbox content; newline/comma separated repo IDs or keywords.
        auto_analyze: Whether analysis should start automatically afterwards.

    Returns:
        Tuple of (repo_ids, current_index, display_df, status_text,
        tab-selection update, trigger string — "auto_analyze" or "").
    """
    if not text.strip():
        return [], 0, pd.DataFrame(), "Status: Please enter repository IDs or keywords.", gr.update(selected="input_tab"), ""

    # Split on newlines/commas once; both branches consume the same tokens.
    tokens = [t.strip() for t in re.split(r'[\n,]+', text) if t.strip()]

    if is_repo_id_format(text):
        # Direct repository IDs — de-duplicate while preserving order.
        repo_ids = list(dict.fromkeys(tokens))
        status = f"✅ Found {len(repo_ids)} repository IDs. "
    else:
        # Keywords — search Hugging Face Spaces for each keyword.
        found = []
        for kw in tokens:
            found.extend(search_top_spaces(kw, limit=5))
        repo_ids = list(dict.fromkeys(found))
        status = f"🔍 Found {len(repo_ids)} repositories from keywords. "

    # Shared tail (previously duplicated in both branches): persist, render,
    # and decide whether the follow-up chain should auto-start analysis.
    write_repos_to_csv(repo_ids)
    df = format_dataframe_for_display(read_csv_to_dataframe())
    if auto_analyze:
        status += "Starting automatic analysis..."
        trigger = "auto_analyze"
    else:
        status += "Ready for manual analysis."
        trigger = ""
    return repo_ids, 0, df, status, gr.update(selected="analysis_tab"), trigger
def handle_auto_analyze_toggle(auto_analyze: bool) -> Any:
    """Reveal the manual-analysis controls only while auto-analyze is off."""
    manual_controls_visible = not auto_analyze
    return gr.update(visible=manual_controls_visible)
def handle_user_message(user_message: str, history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str]:
    """Append the user's message to the chat history and clear the input box.

    An empty history is seeded with the assistant's welcome message first;
    blank messages are ignored.
    """
    history = history or [{"role": "assistant", "content": CHATBOT_INITIAL_MESSAGE}]
    if user_message:
        history.append({"role": "user", "content": user_message})
    # Second element clears the message textbox.
    return history, ""
def handle_bot_response(history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str, str, str, List[str], int, pd.DataFrame, Any]:
    """Generate the assistant's reply; when the chat has enough detail, also
    auto-extract keywords, search for repositories, and jump to analysis.

    Returns (history, chat_status, keywords_str, user_requirements,
    repo_ids, current_index, display_df, tab update).
    """
    # Only respond when the last turn came from the user.
    if not history or history[-1]["role"] != "user":
        return history, "", "", "", [], 0, pd.DataFrame(), gr.update()

    latest_message = history[-1]["content"]
    # The chat API expects prior turns as (user, assistant) tuples.
    prior_turns = convert_messages_to_tuples(history[:-1])
    assistant_reply = chat_with_user(latest_message, prior_turns)
    history.append({"role": "assistant", "content": assistant_reply})

    if should_auto_extract_keywords(history):
        raw_keywords = extract_keywords_from_conversation(convert_messages_to_tuples(history))
        # Keep only keyword-like fragments (words, spaces, hyphens).
        keywords = [frag.strip() for frag in re.findall(r'[\w\s-]+', raw_keywords) if frag.strip()]
        if keywords:
            keywords_str = ", ".join(keywords)
            user_requirements = extract_user_requirements_from_chat(history)
            # Search with at most 3 keywords to keep the result set manageable.
            found = []
            for kw in keywords[:3]:
                found.extend(search_top_spaces(kw, limit=5))
            unique_ids = list(dict.fromkeys(found))
            write_repos_to_csv(unique_ids)
            df = format_dataframe_for_display(read_csv_to_dataframe())
            chat_status = f"🎯 Auto-extracted keywords and found {len(unique_ids)} repositories. Analysis starting automatically..."
            return history, chat_status, keywords_str, user_requirements, unique_ids, 0, df, gr.update(selected="analysis_tab")

    return history, "💬 Conversation continuing...", "", "", [], 0, pd.DataFrame(), gr.update()
def handle_dataframe_select(evt: gr.SelectData, df_data) -> Tuple[str, Any, str]:
    """Handle dataframe row selection - show modal for repo ID (column 0) clicks.

    Returns (selected repo text, modal visibility update, repo-id state value).
    Leftover print() debugging replaced with logger.debug so stdout stays clean.
    """
    if evt is None:
        return "", gr.update(visible=False), ""
    try:
        # Get the selected row and column from the event
        row_idx = evt.index[0]
        col_idx = evt.index[1]
        logger.debug(f"Dataframe selection: row {row_idx}, column {col_idx}")
        # Handle pandas DataFrame
        if isinstance(df_data, pd.DataFrame) and not df_data.empty and row_idx < len(df_data):
            if col_idx == 0:  # Repository name column - show action modal
                repo_id = df_data.iloc[row_idx, 0]
                # Guard against empty cells and pandas NaN rendered as 'nan'.
                if repo_id and str(repo_id).strip() and str(repo_id).strip() != 'nan':
                    clean_repo_id = str(repo_id).strip()
                    logger.info(f"Showing modal for repository: {clean_repo_id}")
                    return clean_repo_id, gr.update(visible=True), clean_repo_id
            # For content columns (1,2,3) and relevance (4), do nothing since full text is shown directly
            else:
                logger.debug(f"Clicked on column {col_idx}, full text already shown in table")
            return "", gr.update(visible=False), ""
        else:
            logger.debug(f"df_data is not a DataFrame or row_idx {row_idx} out of range")
    except Exception as e:
        logger.error(f"Error handling dataframe selection: {e}")
    return "", gr.update(visible=False), ""
def handle_visit_repo(repo_id: str) -> Tuple[Any, str]:
    """Close the action modal and hand back the Hugging Face Space URL."""
    cleaned = repo_id.strip() if repo_id else ""
    if cleaned:
        hf_url = f"https://huggingface.co/spaces/{cleaned}"
        logger.info(f"User chose to visit: {hf_url}")
        return gr.update(visible=False), hf_url
    # Nothing selected — just hide the modal.
    return gr.update(visible=False), ""
def handle_explore_repo(selected_repo_id: str) -> Tuple[Any, Any, Any, str, str]:
    """Switch to the Repo Explorer tab, pre-filling and auto-loading the repo.

    Returns updates for: modal visibility, tab selection, explorer input,
    current repo id state, and the auto-load signal string.
    """
    logger.info(f"DEBUG: handle_explore_repo called with selected_repo_id: '{selected_repo_id}'")
    hide_modal = gr.update(visible=False)
    goto_explorer = gr.update(selected="repo_explorer_tab")
    cleaned = selected_repo_id.strip() if selected_repo_id else ""
    if cleaned and cleaned != 'nan':
        # Populate the explorer input and signal the follow-up chain to load.
        return hide_modal, goto_explorer, gr.update(value=cleaned), cleaned, "auto_load"
    # Still navigate, but leave the explorer input untouched and skip loading.
    return hide_modal, goto_explorer, gr.update(), "", ""
def handle_cancel_modal() -> Any:
    """Dismiss the repository action modal without taking any action."""
    hidden = gr.update(visible=False)
    return hidden
def generate_repo_links_html(df: pd.DataFrame) -> str:
    """Generate HTML with clickable links for repositories.

    Args:
        df: DataFrame whose 'repo id' column (or first column) holds repo IDs.

    Returns:
        A styled <div> of anchor tags, or "" when there is nothing to link.
    """
    if df.empty:
        return ""
    html_links = []
    for _, row in df.iterrows():  # row index unused
        # Series rows expose .get; fall back to positional access otherwise.
        repo_id = row.get('repo id', '') if hasattr(row, 'get') else row[0]
        clean_repo_id = str(repo_id).strip()
        # Skip falsy cells and pandas NaN values rendered as the string 'nan'.
        if repo_id and clean_repo_id and clean_repo_id != 'nan':
            hf_url = f"https://huggingface.co/spaces/{clean_repo_id}"
            html_links.append(f'<a href="{hf_url}" target="_blank" style="display: inline-block; margin: 5px 10px; padding: 8px 16px; background: linear-gradient(45deg, #667eea, #764ba2); color: white; text-decoration: none; border-radius: 8px; font-weight: 600; transition: all 0.3s ease;">{clean_repo_id}</a>')
    if html_links:
        return f'<div style="margin: 10px 0; padding: 15px; background: rgba(255, 255, 255, 0.1); border-radius: 12px; backdrop-filter: blur(10px);">{"".join(html_links)}</div>'
    return ""
def handle_extract_and_analyze(history: List[Dict[str, str]]) -> Tuple[str, str, str, List[str], int, pd.DataFrame, Any, pd.DataFrame, str, Any, str, str]:
    """Extract keywords from chat, search repositories, and immediately start analysis.

    Returns a 12-tuple wired to: (chat_status, extracted keywords,
    user_requirements_state, repo_ids_state, current index, results df,
    tab update, top-repos df, analysis status, top-section visibility,
    all-links HTML, top-links HTML). The slot order must match the
    extract_analyze_btn.click() outputs list exactly.
    """
    if not history:
        return "❌ No conversation to extract from.", "", "", [], 0, pd.DataFrame(), gr.update(), pd.DataFrame(), "", gr.update(visible=False), "", ""
    # Convert the full, valid history for the extraction logic
    tuple_history = convert_messages_to_tuples(history)
    if not tuple_history:
        return "❌ No completed conversations to analyze.", "", "", [], 0, pd.DataFrame(), gr.update(), pd.DataFrame(), "", gr.update(visible=False), "", ""
    # Get raw keywords string from the LLM
    raw_keywords_str = extract_keywords_from_conversation(tuple_history)
    # Sanitize the LLM output to extract only keyword-like parts
    # (runs of word characters, spaces, and hyphens).
    cleaned_keywords = re.findall(r'[\w\s-]+', raw_keywords_str)
    cleaned_keywords = [kw.strip() for kw in cleaned_keywords if kw.strip()]
    if not cleaned_keywords:
        return f"❌ Could not extract valid keywords. Raw output: '{raw_keywords_str}'", "", "", [], 0, pd.DataFrame(), gr.update(), pd.DataFrame(), "", gr.update(visible=False), "", ""
    # Join them into a clean, comma-separated string
    final_keywords_str = ", ".join(cleaned_keywords)
    # Extract user requirements for analysis
    user_requirements = extract_user_requirements_from_chat(history)
    # Auto-search repositories
    repo_ids = []
    for kw in cleaned_keywords[:3]:  # Use top 3 keywords to avoid too many results
        repo_ids.extend(search_top_spaces(kw, limit=5))
    # De-duplicate while preserving discovery order.
    unique_repo_ids = list(dict.fromkeys(repo_ids))
    if not unique_repo_ids:
        return f"❌ No repositories found for keywords: {final_keywords_str}", final_keywords_str, user_requirements, [], 0, pd.DataFrame(), gr.update(), pd.DataFrame(), "", gr.update(visible=False), "", ""
    write_repos_to_csv(unique_repo_ids)
    df = format_dataframe_for_display(read_csv_to_dataframe())
    # Immediately start analysis (delegates to the sibling batch handler).
    try:
        analyzed_df, analysis_status, top_repos, top_section_update, all_links, top_links = handle_analyze_all_repos(unique_repo_ids, user_requirements)
        chat_status = f"🎉 Extracted keywords → Found {len(unique_repo_ids)} repositories → Analysis complete!"
        return chat_status, final_keywords_str, user_requirements, unique_repo_ids, 0, analyzed_df, gr.update(selected="analysis_tab"), top_repos, analysis_status, top_section_update, all_links, top_links
    except Exception as e:
        logger.error(f"Error during extract and analyze: {e}")
        # Keep the found repositories visible even though analysis failed.
        error_status = f"✅ Found {len(unique_repo_ids)} repositories, but analysis failed: {e}"
        return error_status, final_keywords_str, user_requirements, unique_repo_ids, 0, df, gr.update(selected="analysis_tab"), pd.DataFrame(), "", gr.update(visible=False), "", ""
def extract_user_requirements_from_chat(history: List[Dict[str, str]]) -> str:
    """Summarize the user's side of the conversation as a bullet list.

    Each non-blank user message becomes one "- <message>" line; returns ""
    when there is no history or no user turns.
    """
    if not history:
        return ""
    user_messages = [turn.get('content', '') for turn in history if turn.get('role') == 'user']
    if not user_messages:
        return ""
    return "\n".join(f"- {text}" for text in user_messages if text.strip())
def handle_analyze_all_repos(repo_ids: List[str], user_requirements: str, progress=gr.Progress()) -> Tuple[pd.DataFrame, str, pd.DataFrame, Any, str, str]:
    """Analyzes all repositories in the CSV file with progress tracking.

    Args:
        repo_ids: Repository IDs to analyze, in order.
        user_requirements: Requirements text used for relevance scoring.
        progress: Gradio progress tracker (injected when used as an event fn).

    Returns:
        (analyzed display df, status text, top-3 display df, visibility
        update for the top-repos section, all-links HTML, top-links HTML).
    """
    if not repo_ids:
        return pd.DataFrame(), "Status: No repositories to analyze. Please submit repo IDs first.", pd.DataFrame(), gr.update(visible=False), "", ""
    total_repos = len(repo_ids)
    try:
        # Start the progress tracking
        progress(0, desc="Initializing batch analysis...")
        successful_analyses = 0
        failed_analyses = 0
        csv_update_failures = 0
        for i, repo_id in enumerate(repo_ids):
            # Update progress
            progress_percent = (i / total_repos)
            progress(progress_percent, desc=f"Analyzing {repo_id} ({i+1}/{total_repos})")
            try:
                logger.info(f"Batch analysis: Processing {repo_id} ({i+1}/{total_repos})")
                # Analyze the repository
                content, summary, df = analyze_and_update_single_repo(repo_id, user_requirements)
                # Verify the CSV was actually updated by checking if the repo has analysis data
                # NOTE(review): .strip() assumes these cells are strings; a NaN
                # float would raise here — verify read_csv_to_dataframe fills NaN.
                updated_df = read_csv_to_dataframe()
                repo_updated = False
                for idx, row in updated_df.iterrows():
                    if row["repo id"] == repo_id:
                        # Check if any analysis field is populated
                        if (row.get("strength", "").strip() or
                            row.get("weaknesses", "").strip() or
                            row.get("speciality", "").strip() or
                            row.get("relevance rating", "").strip()):
                            repo_updated = True
                        break
                if repo_updated:
                    successful_analyses += 1
                else:
                    # CSV update failed - try once more
                    logger.warning(f"CSV update failed for {repo_id}, attempting retry...")
                    time.sleep(0.5)  # Wait a bit longer
                    # Force re-read and re-update
                    df_retry = read_csv_to_dataframe()
                    retry_success = False
                    # Re-parse the analysis if available
                    if summary and "JSON extraction: SUCCESS" in summary:
                        # Extract the analysis from summary - this is a fallback
                        logger.info(f"Attempting to re-update CSV for {repo_id}")
                        content_retry, summary_retry, df_retry = analyze_and_update_single_repo(repo_id, user_requirements)
                        # Check again
                        final_df = read_csv_to_dataframe()
                        for idx, row in final_df.iterrows():
                            if row["repo id"] == repo_id:
                                if (row.get("strength", "").strip() or
                                    row.get("weaknesses", "").strip() or
                                    row.get("speciality", "").strip() or
                                    row.get("relevance rating", "").strip()):
                                    retry_success = True
                                break
                    if retry_success:
                        successful_analyses += 1
                    else:
                        csv_update_failures += 1
                # Longer delay to prevent file conflicts
                time.sleep(0.3)
            except Exception as e:
                logger.error(f"Error analyzing {repo_id}: {e}")
                failed_analyses += 1
                # Still wait to prevent rapid failures
                time.sleep(0.2)
        # Complete the progress
        progress(1.0, desc="Batch analysis completed!")
        # Get final updated dataframe
        updated_df = read_csv_to_dataframe()
        # Filter out rows with no analysis data for consistent display with top 3
        analyzed_df = updated_df.copy()
        analyzed_df = analyzed_df[
            (analyzed_df['strength'].str.strip() != '') |
            (analyzed_df['weaknesses'].str.strip() != '') |
            (analyzed_df['speciality'].str.strip() != '') |
            (analyzed_df['relevance rating'].str.strip() != '')
        ]
        # Get top 3 most relevant repositories using full data
        top_repos = get_top_relevant_repos(updated_df, user_requirements, top_n=3)
        # Generate HTML links for repositories
        all_links_html = generate_repo_links_html(analyzed_df)
        top_links_html = generate_repo_links_html(top_repos) if not top_repos.empty else ""
        # Final status with detailed breakdown
        final_status = f"🎉 Batch Analysis Complete!\n✅ Successful: {successful_analyses}/{total_repos}\n❌ Failed: {failed_analyses}/{total_repos}"
        if csv_update_failures > 0:
            final_status += f"\n⚠️ CSV Update Issues: {csv_update_failures}/{total_repos}"
        # Add top repos info if available
        if not top_repos.empty:
            final_status += f"\n\n🏆 Top {len(top_repos)} most relevant repositories selected!"
        # Show top repos section if we have results
        show_top_section = gr.update(visible=not top_repos.empty)
        logger.info(f"Batch analysis completed: {successful_analyses} successful, {failed_analyses} failed, {csv_update_failures} CSV update issues")
        return format_dataframe_for_display(analyzed_df), final_status, format_dataframe_for_display(top_repos), show_top_section, all_links_html, top_links_html
    except Exception as e:
        logger.error(f"Error in batch analysis: {e}")
        error_status = f"❌ Batch analysis failed: {e}"
        return format_dataframe_for_display(read_csv_to_dataframe()), error_status, pd.DataFrame(), gr.update(visible=False), "", ""
def handle_reset_everything() -> Tuple[List[str], int, str, pd.DataFrame, pd.DataFrame, Any, List[Dict[str, str]], str, str, str]:
    """Reset the whole app to its initial state.

    Deletes the persisted CSV and returns fresh values for every wired
    component: states, dataframes, top-repos visibility, chatbot, status,
    requirements display, and keywords box — in that order.
    """
    fresh_chat = [{"role": "assistant", "content": CHATBOT_INITIAL_MESSAGE}]
    try:
        # Drop the persisted CSV so the next run starts from a clean slate.
        if os.path.exists(CSV_FILE):
            os.remove(CSV_FILE)
            logger.info("CSV file deleted for reset")
        blank_df = pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
        logger.info("Complete system reset performed")
        return (
            [],                                    # repo_ids_state
            0,                                     # current_repo_idx_state
            "",                                    # user_requirements_state
            blank_df,                              # df_output
            blank_df,                              # top_repos_df
            gr.update(visible=False),              # top_repos_section
            fresh_chat,                            # chatbot
            "Status: Everything has been reset. Ready to start fresh!",  # status_box_input
            "No requirements extracted yet.",      # current_requirements_display
            ""                                     # extracted_keywords_output
        )
    except Exception as e:
        logger.error(f"Error during reset: {e}")
        return (
            [],                                    # repo_ids_state
            0,                                     # current_repo_idx_state
            "",                                    # user_requirements_state
            pd.DataFrame(),                        # df_output
            pd.DataFrame(),                        # top_repos_df
            gr.update(visible=False),              # top_repos_section
            fresh_chat,                            # chatbot
            f"Reset failed: {e}",                  # status_box_input
            "No requirements extracted yet.",      # current_requirements_display
            ""                                     # extracted_keywords_output
        )
# --- Component Event Wiring ---
# NOTE(review): several names here (tabs, reset_all_btn, help_btn, help_modal,
# selected_repo_id_state, extracted_keywords_output, the *_state variables)
# are defined earlier in this UI builder, outside this section.
# Initialize chatbot with welcome message on app load
app.load(
    fn=lambda: [{"role": "assistant", "content": CHATBOT_INITIAL_MESSAGE}],
    outputs=[chatbot]
)
# Smart Input with Auto-processing
smart_input.submit(
    fn=handle_smart_input,
    inputs=[smart_input, auto_analyze_checkbox],
    # status_box_input appears twice: the last slot receives the trigger
    # string ("auto_analyze" or "") that the .then() below reads back.
    outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_input, tabs, status_box_input]
).then(
    # If auto_analyze is enabled and we got repos, start analysis automatically
    fn=lambda repo_ids, user_reqs, trigger: handle_analyze_all_repos(repo_ids, user_reqs) if trigger == "auto_analyze" and repo_ids else (pd.DataFrame(), "Ready for analysis.", pd.DataFrame(), gr.update(visible=False), "", ""),
    inputs=[repo_ids_state, user_requirements_state, status_box_input],
    outputs=[df_output, status_box_input, top_repos_df, top_repos_section, all_repo_links, top_repo_links]
)
# Smart Submit Button (same behavior as enter)
smart_submit_btn.click(
    fn=handle_smart_input,
    inputs=[smart_input, auto_analyze_checkbox],
    outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_input, tabs, status_box_input]
).then(
    # If auto_analyze is enabled and we got repos, start analysis automatically
    fn=lambda repo_ids, user_reqs, trigger: handle_analyze_all_repos(repo_ids, user_reqs) if trigger == "auto_analyze" and repo_ids else (pd.DataFrame(), "Ready for analysis.", pd.DataFrame(), gr.update(visible=False), "", ""),
    inputs=[repo_ids_state, user_requirements_state, status_box_input],
    outputs=[df_output, status_box_input, top_repos_df, top_repos_section, all_repo_links, top_repo_links]
)
# Auto-analyze checkbox toggle
auto_analyze_checkbox.change(
    fn=handle_auto_analyze_toggle,
    inputs=[auto_analyze_checkbox],
    outputs=[manual_analysis_row]
)
# Manual analysis button (when auto-analyze is disabled)
analyze_all_btn.click(
    fn=handle_analyze_all_repos,
    inputs=[repo_ids_state, user_requirements_state],
    outputs=[df_output, status_box_analysis, top_repos_df, top_repos_section, all_repo_links, top_repo_links]
)
# Chatbot with Auto-extraction and Auto-search
msg_input.submit(
    fn=handle_user_message,
    inputs=[msg_input, chatbot],
    outputs=[chatbot, msg_input]
).then(
    fn=handle_bot_response,
    inputs=[chatbot],
    outputs=[chatbot, chat_status, extracted_keywords_output, user_requirements_state, repo_ids_state, current_repo_idx_state, df_output, tabs]
).then(
    # Update requirements display when they change
    fn=lambda req: req if req.strip() else "No specific requirements extracted from conversation.",
    inputs=[user_requirements_state],
    outputs=[current_requirements_display]
).then(
    # If we got repos from chatbot, auto-analyze them
    fn=lambda repo_ids, user_reqs: handle_analyze_all_repos(repo_ids, user_reqs) if repo_ids else (pd.DataFrame(), "", pd.DataFrame(), gr.update(visible=False), "", ""),
    inputs=[repo_ids_state, user_requirements_state],
    outputs=[df_output, chat_status, top_repos_df, top_repos_section, all_repo_links, top_repo_links]
)
# Send button mirrors msg_input.submit exactly.
send_btn.click(
    fn=handle_user_message,
    inputs=[msg_input, chatbot],
    outputs=[chatbot, msg_input]
).then(
    fn=handle_bot_response,
    inputs=[chatbot],
    outputs=[chatbot, chat_status, extracted_keywords_output, user_requirements_state, repo_ids_state, current_repo_idx_state, df_output, tabs]
).then(
    # Update requirements display when they change
    fn=lambda req: req if req.strip() else "No specific requirements extracted from conversation.",
    inputs=[user_requirements_state],
    outputs=[current_requirements_display]
).then(
    # If we got repos from chatbot, auto-analyze them
    fn=lambda repo_ids, user_reqs: handle_analyze_all_repos(repo_ids, user_reqs) if repo_ids else (pd.DataFrame(), "", pd.DataFrame(), gr.update(visible=False), "", ""),
    inputs=[repo_ids_state, user_requirements_state],
    outputs=[df_output, chat_status, top_repos_df, top_repos_section, all_repo_links, top_repo_links]
)
# Extract and Analyze Button (one-click solution for chatbot)
extract_analyze_btn.click(
    fn=handle_extract_and_analyze,
    inputs=[chatbot],
    outputs=[chat_status, extracted_keywords_output, user_requirements_state, repo_ids_state, current_repo_idx_state, df_output, tabs, top_repos_df, status_box_analysis, top_repos_section, all_repo_links, top_repo_links]
).then(
    # Update requirements display when they change
    fn=lambda req: req if req.strip() else "No specific requirements extracted from conversation.",
    inputs=[user_requirements_state],
    outputs=[current_requirements_display]
)
# Repo Explorer Tab
setup_repo_explorer_events(repo_components, repo_states)
# Direct Repository Clicks - Show Modal (like old_app2.py)
df_output.select(
    fn=handle_dataframe_select,
    inputs=[df_output],
    outputs=[selected_repo_display, repo_action_modal, selected_repo_id_state]
)
top_repos_df.select(
    fn=handle_dataframe_select,
    inputs=[top_repos_df],
    outputs=[selected_repo_display, repo_action_modal, selected_repo_id_state]
)
# Modal button events (like old_app2.py)
visit_repo_btn.click(
    fn=handle_visit_repo,
    inputs=[selected_repo_display],
    outputs=[repo_action_modal, selected_repo_display],
    # Open the Space in a new browser tab client-side.
    js="(repo_id) => { if(repo_id && repo_id.trim()) { window.open('https://huggingface.co/spaces/' + repo_id.trim(), '_blank'); } }"
)
explore_repo_btn.click(
    fn=handle_explore_repo,
    inputs=[selected_repo_id_state],
    outputs=[
        repo_action_modal,
        tabs,
        repo_components["repo_explorer_input"],
        repo_states["current_repo_id"],  # Set the current repo ID
        status_box_input  # Use for auto-load signal
    ],
    js="""(repo_id) => {
        console.log('DEBUG: Navigate to repo explorer for:', repo_id);
        setTimeout(() => {
            window.scrollTo({top: 0, behavior: 'smooth'});
        }, 200);
    }"""
).then(
    # Auto-load the repository if the signal indicates to do so
    fn=lambda repo_id, signal: handle_load_repository(repo_id) if signal == "auto_load" and repo_id else ("", ""),
    inputs=[repo_states["current_repo_id"], status_box_input],
    outputs=[repo_components["repo_status_display"], repo_states["repo_context_summary"]]
).then(
    # Initialize the chatbot with welcome message after auto-loading
    fn=lambda repo_status, repo_id, repo_context, signal: (
        initialize_repo_chatbot(repo_status, repo_id, repo_context)
        if signal == "auto_load" and repo_id else []
    ),
    inputs=[repo_components["repo_status_display"], repo_states["current_repo_id"], repo_states["repo_context_summary"], status_box_input],
    outputs=[repo_components["repo_chatbot"]]
)
cancel_modal_btn.click(
    fn=handle_cancel_modal,
    outputs=[repo_action_modal]
)
# Reset button event
reset_all_btn.click(
    fn=handle_reset_everything,
    outputs=[repo_ids_state, current_repo_idx_state, user_requirements_state, df_output, top_repos_df, top_repos_section, chatbot, status_box_input, current_requirements_display, extracted_keywords_output]
)
# Help modal events
help_btn.click(
    fn=lambda: gr.update(visible=True),
    outputs=[help_modal]
)
close_help_btn.click(
    fn=lambda: gr.update(visible=False),
    outputs=[help_modal]
)
| return app | |
# Script entry point: build the Gradio UI and serve it with debug enabled.
if __name__ == "__main__":
    app = create_ui()
    app.launch(debug=True)