Spaces:

Agents-MCP-Hackathon
/

HF_RepoSense

Running

App Files Files Community

naman1102 committed 15 days ago

Commit

274a509

1 Parent(s): f3ed537

Update app.py

Browse files

Files changed (1) hide show

app.py +211 -205

app.py CHANGED Viewed

@@ -2,20 +2,19 @@ import gradio as gr
 import regex as re
 import csv
 import pandas as pd
-from typing import List, Dict, Tuple, Optional
 import logging
-from datetime import datetime
 import os
-from huggingface_hub import HfApi, SpaceCard
 from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response
 from hf_utils import download_space_repo, search_top_spaces
 from chatbot_page import chat_with_user, extract_keywords_from_conversation
-# Configure logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Constants
 CSV_FILE = "repo_ids.csv"
 CHATBOT_SYSTEM_PROMPT = (
     "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
@@ -23,268 +22,275 @@ CHATBOT_SYSTEM_PROMPT = (
     "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
     "Return only the keywords as a comma-separated list."
 )
-class AppState:
-    """State management for the application."""
-    def __init__(self):
-        self.repo_ids: List[str] = []
-        self.current_repo_idx: int = 0
-        self.generated_keywords: List[str] = []
-        self.chat_history: List[Dict[str, str]] = []
-def read_csv_as_text(filename: str) -> pd.DataFrame:
-    """Read CSV file and return as DataFrame."""
-    try:
-        return pd.read_csv(filename, dtype=str)
-    except Exception as e:
-        logger.error(f"Error reading CSV: {e}")
-        return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
 def write_repos_to_csv(repo_ids: List[str]) -> None:
-    """Write repository IDs to CSV file."""
     try:
-        with open(CSV_FILE, 'w', newline='', encoding="utf-8") as f:
-            writer = csv.writer(f)
             writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
             for repo_id in repo_ids:
                 writer.writerow([repo_id, "", "", "", ""])
     except Exception as e:
         logger.error(f"Error writing to CSV: {e}")
-def process_repo_input(text: str, state: AppState) -> pd.DataFrame:
-    """Process repository IDs input."""
-    if not text:
-        state.repo_ids = []
-        state.current_repo_idx = 0
-        return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-    repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
-    state.repo_ids = repo_ids
-    state.current_repo_idx = 0
-    write_repos_to_csv(repo_ids)
-    return read_csv_as_text(CSV_FILE)
-def keyword_search_and_update(keyword: str, state: AppState) -> pd.DataFrame:
-    """Search for repositories by keywords."""
-    if not keyword:
         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-    keyword_list = [k.strip() for k in re.split(r'[\n,]+', keyword) if k.strip()]
-    repo_ids = []
-    for kw in keyword_list:
-        repo_ids.extend(search_top_spaces(kw, limit=5))
-    # Remove duplicates while preserving order
-    seen = set()
-    unique_repo_ids = []
-    for rid in repo_ids:
-        if rid not in seen:
-            unique_repo_ids.append(rid)
-            seen.add(rid)
-    state.repo_ids = unique_repo_ids
-    state.current_repo_idx = 0
-    write_repos_to_csv(unique_repo_ids)
-    return read_csv_as_text(CSV_FILE)
-def analyze_single_repo(repo_id: str) -> Tuple[str, str, Dict]:
-    """Analyze a single repository."""
     try:
         download_space_repo(repo_id, local_dir="repo_files")
         txt_path = combine_repo_files_for_llm()
         with open(txt_path, "r", encoding="utf-8") as f:
             combined_content = f.read()
         llm_output = analyze_combined_file(txt_path)
         last_start = llm_output.rfind('{')
         last_end = llm_output.rfind('}')
-        final_json_str = llm_output[last_start:last_end+1] if last_start != -1 and last_end != -1 and last_end > last_start else llm_output
         llm_json = parse_llm_json_response(final_json_str)
         if isinstance(llm_json, dict) and "error" not in llm_json:
-            strengths = llm_json.get("strength", "")
-            weaknesses = llm_json.get("weaknesses", "")
             summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
         else:
-            summary = f"JSON extraction: FAILED\nRaw: {llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json}"
-        return combined_content, summary, llm_json
-    except Exception as e:
-        logger.error(f"Error analyzing repo {repo_id}: {e}")
-        return f"Error analyzing {repo_id}", f"Error: {str(e)}", {"error": str(e)}
-def update_csv_with_analysis(repo_id: str, analysis_results: Dict) -> pd.DataFrame:
-    """Update CSV file with analysis results."""
-    try:
-        df = read_csv_as_text(CSV_FILE)
-        updated = False
         for idx, row in df.iterrows():
             if row["repo id"] == repo_id:
-                if isinstance(analysis_results, dict) and "error" not in analysis_results:
-                    df.at[idx, "strength"] = analysis_results.get("strength", "")
-                    df.at[idx, "weaknesses"] = analysis_results.get("weaknesses", "")
-                    df.at[idx, "speciality"] = analysis_results.get("speciality", "")
-                    df.at[idx, "relevance rating"] = analysis_results.get("relevance rating", "")
-                updated = True
                 break
-        if not updated and isinstance(analysis_results, dict) and "error" not in analysis_results:
-            new_row = {
-                "repo id": repo_id,
-                "strength": analysis_results.get("strength", ""),
-                "weaknesses": analysis_results.get("weaknesses", ""),
-                "speciality": analysis_results.get("speciality", ""),
-                "relevance rating": analysis_results.get("relevance rating", "")
-            }
-            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
         df.to_csv(CSV_FILE, index=False)
-        return df
     except Exception as e:
-        logger.error(f"Error updating CSV: {e}")
-        return read_csv_as_text(CSV_FILE)
-def show_combined_repo_and_llm(state: AppState) -> Tuple[str, str, pd.DataFrame]:
-    """Show combined repo content and LLM analysis."""
-    if not state.repo_ids:
-        return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
-    if state.current_repo_idx >= len(state.repo_ids):
-        return "All repo IDs have been processed.", "", read_csv_as_text(CSV_FILE)
-    repo_id = state.repo_ids[state.current_repo_idx]
-    combined_content, summary, analysis_results = analyze_single_repo(repo_id)
-    df = update_csv_with_analysis(repo_id, analysis_results)
-    state.current_repo_idx += 1
-    return combined_content, summary, df
 def create_ui() -> gr.Blocks:
-    """Create the Gradio interface."""
-    state = gr.State(AppState())
-    with gr.Blocks(title="Hugging Face Repo Analyzer", theme=gr.themes.Soft()) as app:
-        gr.Markdown("# Hugging Face Repository Analyzer")
-        with gr.Row():
-            with gr.Column():
-                # Input Section
-                gr.Markdown("### Enter Repository IDs")
-                repo_id_input = gr.Textbox(
-                    label="Enter repo IDs (comma or newline separated)",
-                    lines=5,
-                    placeholder="repo1, repo2\nrepo3"
-                )
-                submit_btn = gr.Button("Submit Repository IDs", variant="primary")
-                gr.Markdown("### Or Search by Keywords")
-                keyword_input = gr.Textbox(
-                    label="Enter keywords to search",
-                    lines=3,
-                    placeholder="Enter keywords separated by commas"
-                )
-                search_btn = gr.Button("Search by Keywords", variant="primary")
-                status = gr.Textbox(label="Status", visible=True)
-                # Results Section
-                df_output = gr.Dataframe(
-                    headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating"],
-                    datatype=["str", "str", "str", "str", "str"]
-                )
-                # Analysis Section
-                content_output = gr.Textbox(label="Repository Content", lines=10)
-                summary_output = gr.Textbox(label="Analysis Summary", lines=5)
                 with gr.Row():
-                    analyze_btn = gr.Button("Analyze Next Repository", variant="primary")
-                    finish_btn = gr.Button("Finish Analysis", variant="secondary")
-                # Chat Section
                 chatbot = gr.Chatbot(
                     label="Chat with Assistant",
                     height=400,
                     type="messages"
                 )
-                msg = gr.Textbox(label="Message", placeholder="Ask about the repository...")
                 with gr.Row():
                     send_btn = gr.Button("Send", variant="primary")
-                    end_chat_btn = gr.Button("End Chat", variant="secondary")
-        def process_repo_input_with_status(text: str, state: AppState) -> Tuple[pd.DataFrame, str]:
-            """Process repo input with status update."""
-            df = process_repo_input(text, state)
-            return df, f"Found {len(state.repo_ids)} repositories"
-        def keyword_search_with_status(keyword: str, state: AppState) -> Tuple[pd.DataFrame, str]:
-            """Search keywords with status update."""
-            df = keyword_search_and_update(keyword, state)
-            return df, f"Found {len(state.repo_ids)} repositories"
-        def analyze_with_status(state: AppState) -> Tuple[str, str, pd.DataFrame, str]:
-            """Analyze with status update."""
-            content, summary, df = show_combined_repo_and_llm(state)
-            return content, summary, df, f"Analyzing repository {state.current_repo_idx} of {len(state.repo_ids)}"
-        def send_message_with_status(message: str, history: List[Dict[str, str]], state: AppState) -> Tuple[List[Dict[str, str]], str]:
-            """Send message with status update."""
-            if not message:
-                return history, ""
-            history.append({"role": "user", "content": message})
-            response = chat_with_user(message, history, CHATBOT_SYSTEM_PROMPT)
-            history.append({"role": "assistant", "content": response})
             return history, ""
-        def end_chat_with_status(history: List[Dict[str, str]], state: AppState) -> Tuple[List[str], str]:
-            """End chat and extract keywords."""
             if not history:
-                return [], "No chat history to analyze"
-            keywords = extract_keywords_from_conversation(history)
-            state.generated_keywords = keywords
-            return keywords, "Keywords extracted from conversation"
-        # Event handlers
-        submit_btn.click(
-            fn=process_repo_input_with_status,
-            inputs=[repo_id_input, state],
-            outputs=[df_output, status]
         )
         search_btn.click(
-            fn=keyword_search_with_status,
-            inputs=[keyword_input, state],
-            outputs=[df_output, status]
         )
-        analyze_btn.click(
-            fn=analyze_with_status,
-            inputs=[state],
-            outputs=[content_output, summary_output, df_output, status]
         )
         send_btn.click(
-            fn=send_message_with_status,
-            inputs=[msg, chatbot, state],
-            outputs=[chatbot, msg]
         )
         end_chat_btn.click(
-            fn=end_chat_with_status,
-            inputs=[chatbot, state],
-            outputs=[gr.Textbox(label="Extracted Keywords"), status]
         )
     return app
 if __name__ == "__main__":
     app = create_ui()
-    app.launch()

 import regex as re
 import csv
 import pandas as pd
+from typing import List, Dict, Tuple, Any
 import logging
 import os
+# Import core logic from other modules, as in app_old.py
 from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response
 from hf_utils import download_space_repo, search_top_spaces
 from chatbot_page import chat_with_user, extract_keywords_from_conversation
+# --- Configuration: logging setup, CSV location, and chatbot prompts ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 # CSV holding one row per repo ID plus its analysis columns (see write_repos_to_csv).
 CSV_FILE = "repo_ids.csv"
 # System prompt handed to chat_with_user on every chatbot turn.
 CHATBOT_SYSTEM_PROMPT = (
     "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
     "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
     "Return only the keywords as a comma-separated list."
 )
 # Greeting used as the chatbot's initial value in create_ui.
+CHATBOT_INITIAL_MESSAGE = "Hello! Please tell me about your ideal Hugging Face repo. What use case, preferred language, or features are you looking for?"
+# --- Helper Functions (Logic) ---
 def write_repos_to_csv(repo_ids: List[str]) -> None:
+    """Writes a list of repo IDs to the CSV file, overwriting the previous content."""
     try:
+        with open(CSV_FILE, mode="w", newline='', encoding="utf-8") as csvfile:
+            writer = csv.writer(csvfile)
             writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
             for repo_id in repo_ids:
                 writer.writerow([repo_id, "", "", "", ""])
+        logger.info(f"Wrote {len(repo_ids)} repo IDs to {CSV_FILE}")
     except Exception as e:
         logger.error(f"Error writing to CSV: {e}")
+def read_csv_to_dataframe() -> pd.DataFrame:
+    """Reads the CSV file into a pandas DataFrame."""
+    try:
+        return pd.read_csv(CSV_FILE, dtype=str).fillna('')
+    except FileNotFoundError:
         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
+    except Exception as e:
+        logger.error(f"Error reading CSV: {e}")
+        return pd.DataFrame()
+def analyze_and_update_single_repo(repo_id: str) -> Tuple[str, str, pd.DataFrame]:
+    """
+    Downloads, analyzes a single repo, updates the CSV, and returns results.
+    This function combines the logic of downloading, analyzing, and updating the CSV for one repo.
+    """
     try:
+        logger.info(f"Starting analysis for repo: {repo_id}")
         download_space_repo(repo_id, local_dir="repo_files")
         txt_path = combine_repo_files_for_llm()
         with open(txt_path, "r", encoding="utf-8") as f:
             combined_content = f.read()
         llm_output = analyze_combined_file(txt_path)
         last_start = llm_output.rfind('{')
         last_end = llm_output.rfind('}')
+        final_json_str = llm_output[last_start:last_end+1] if last_start != -1 and last_end != -1 else "{}"
         llm_json = parse_llm_json_response(final_json_str)
+        summary = ""
         if isinstance(llm_json, dict) and "error" not in llm_json:
+            strengths = llm_json.get("strength", "N/A")
+            weaknesses = llm_json.get("weaknesses", "N/A")
             summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
         else:
+            summary = f"JSON extraction: FAILED\nRaw response might not be valid JSON."
+        # Update CSV
+        df = read_csv_to_dataframe()
+        repo_found_in_df = False
         for idx, row in df.iterrows():
             if row["repo id"] == repo_id:
+                if isinstance(llm_json, dict):
+                    df.at[idx, "strength"] = llm_json.get("strength", "")
+                    df.at[idx, "weaknesses"] = llm_json.get("weaknesses", "")
+                    df.at[idx, "speciality"] = llm_json.get("speciality", "")
+                    df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
+                repo_found_in_df = True
                 break
+        if not repo_found_in_df:
+             logger.warning(f"Repo ID {repo_id} not found in CSV for updating.")
         df.to_csv(CSV_FILE, index=False)
+        logger.info(f"Successfully analyzed and updated CSV for {repo_id}")
+        return combined_content, summary, df
     except Exception as e:
+        logger.error(f"An error occurred during analysis of {repo_id}: {e}")
+        error_summary = f"Error analyzing repo: {e}"
+        return "", error_summary, read_csv_to_dataframe()
+# --- Gradio UI ---
 def create_ui() -> gr.Blocks:
+    """Creates and configures the entire Gradio interface."""
+    with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Repo Analyzer") as app:
+        # --- State Management ---
+        # Using simple, separate state objects for robustness.
+        repo_ids_state = gr.State([])
+        current_repo_idx_state = gr.State(0)
+        gr.Markdown("# Hugging Face Repository Analyzer")
+        with gr.Tabs() as tabs:
+            # --- Input Tab ---
+            with gr.TabItem("1. Input Repositories", id="input_tab"):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("## Enter Repository IDs")
+                        repo_id_input = gr.Textbox(
+                            label="Enter repo IDs (comma or newline separated)",
+                            lines=8,
+                            placeholder="org/repo1, org/repo2"
+                        )
+                        submit_repo_btn = gr.Button("Submit Repository IDs", variant="primary")
+                    with gr.Column():
+                        gr.Markdown("## Or Search by Keywords")
+                        keyword_input = gr.Textbox(
+                            label="Enter keywords to search",
+                            lines=8,
+                            placeholder="e.g., text generation, image classification"
+                        )
+                        search_btn = gr.Button("Search by Keywords", variant="primary")
+                status_box_input = gr.Textbox(label="Status", interactive=False)
+            # --- Analysis Tab ---
+            with gr.TabItem("2. Analyze Repositories", id="analysis_tab"):
+                gr.Markdown("## Repository Analysis")
+                analyze_next_btn = gr.Button("Analyze Next Repository", variant="primary")
+                status_box_analysis = gr.Textbox(label="Status", interactive=False)
                 with gr.Row():
+                    content_output = gr.Textbox(label="Repository Content", lines=20)
+                    summary_output = gr.Textbox(label="Analysis Summary", lines=20)
+                gr.Markdown("### Analysis Results Table")
+                df_output = gr.Dataframe(headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
+            # --- Chatbot Tab ---
+            with gr.TabItem("3. Find Repos with AI", id="chatbot_tab"):
+                gr.Markdown("## Chat with an Assistant to Find Repositories")
                 chatbot = gr.Chatbot(
+                    value=[(None, CHATBOT_INITIAL_MESSAGE)],
                     label="Chat with Assistant",
                     height=400,
                     type="messages"
                 )
+                msg_input = gr.Textbox(label="Your Message", placeholder="Type your message here...", lines=2)
                 with gr.Row():
                     send_btn = gr.Button("Send", variant="primary")
+                    end_chat_btn = gr.Button("End Chat & Get Keywords")
+                gr.Markdown("### Extracted Keywords")
+                extracted_keywords_output = gr.Textbox(label="Keywords", interactive=False)
+                use_keywords_btn = gr.Button("Use These Keywords to Search", variant="primary")
+                status_box_chatbot = gr.Textbox(label="Status", interactive=False)
+        # --- Event Handler Functions ---
+        def handle_repo_id_submission(text: str) -> Tuple[List[str], int, pd.DataFrame, str, Any]:
+            """Processes submitted repo IDs, updates state, and prepares for analysis."""
+            if not text:
+                return [], 0, pd.DataFrame(), "Status: Please enter repository IDs.", gr.update(selected="input_tab")
+            repo_ids = list(dict.fromkeys([repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]))
+            write_repos_to_csv(repo_ids)
+            df = read_csv_to_dataframe()
+            status = f"Status: {len(repo_ids)} repositories submitted. Ready for analysis."
+            return repo_ids, 0, df, status, gr.update(selected="analysis_tab")
+        def handle_keyword_search(keywords: str) -> Tuple[List[str], int, pd.DataFrame, str, Any]:
+            """Processes submitted keywords, finds repos, updates state, and prepares for analysis."""
+            if not keywords:
+                return [], 0, pd.DataFrame(), "Status: Please enter keywords.", gr.update(selected="input_tab")
+            keyword_list = [k.strip() for k in re.split(r'[\n,]+', keywords) if k.strip()]
+            repo_ids = []
+            for kw in keyword_list:
+                repo_ids.extend(search_top_spaces(kw, limit=5))
+            unique_repo_ids = list(dict.fromkeys(repo_ids))
+            write_repos_to_csv(unique_repo_ids)
+            df = read_csv_to_dataframe()
+            status = f"Status: Found {len(unique_repo_ids)} repositories. Ready for analysis."
+            return unique_repo_ids, 0, df, status, gr.update(selected="analysis_tab")
+        def handle_analyze_next(repo_ids: List[str], current_idx: int) -> Tuple[str, str, pd.DataFrame, int, str]:
+            """Analyzes the next repository in the list."""
+            if not repo_ids:
+                return "", "", pd.DataFrame(), 0, "Status: No repositories to analyze. Please submit repo IDs first."
+            if current_idx >= len(repo_ids):
+                return "", "", read_csv_to_dataframe(), current_idx, "Status: All repositories have been analyzed."
+            repo_id_to_analyze = repo_ids[current_idx]
+            status = f"Status: Analyzing repository {current_idx + 1}/{len(repo_ids)}: {repo_id_to_analyze}"
+            content, summary, df = analyze_and_update_single_repo(repo_id_to_analyze)
+            next_idx = current_idx + 1
+            if next_idx >= len(repo_ids):
+                status += "\n\nFinished all analyses."
+            return content, summary, df, next_idx, status
+        def handle_user_message(user_message: str, history: List[List[str]]) -> Tuple[List[List[str]], str]:
+            """Handles sending a user message to the chatbot."""
+            history.append([user_message, None])
             return history, ""
+        def handle_bot_response(history: List[List[str]]) -> List[List[str]]:
+            """Generates and displays the bot's response."""
+            user_message = history[-1][0]
+            response = chat_with_user(user_message, history[:-1], CHATBOT_SYSTEM_PROMPT)
+            history[-1][1] = response
+            return history
+        def handle_end_chat(history: List[List[str]]) -> Tuple[str, str]:
+            """Ends the chat and extracts keywords from the conversation."""
             if not history:
+                return "", "Status: Chat is empty, nothing to analyze."
+            keywords_str = extract_keywords_from_conversation(history)
+            status = "Status: Keywords extracted. You can now use them to search."
+            return keywords_str, status
+        # --- Component Event Wiring ---
+        # Input Tab
+        submit_repo_btn.click(
+            fn=handle_repo_id_submission,
+            inputs=[repo_id_input],
+            outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_analysis, tabs]
         )
         search_btn.click(
+            fn=handle_keyword_search,
+            inputs=[keyword_input],
+            outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_analysis, tabs]
         )
+        # Analysis Tab
+        analyze_next_btn.click(
+            fn=handle_analyze_next,
+            inputs=[repo_ids_state, current_repo_idx_state],
+            outputs=[content_output, summary_output, df_output, current_repo_idx_state, status_box_analysis]
         )
+        # Chatbot Tab
+        msg_input.submit(
+            fn=handle_user_message,
+            inputs=[msg_input, chatbot],
+            outputs=[chatbot, msg_input]
+        ).then(
+            fn=handle_bot_response,
+            inputs=[chatbot],
+            outputs=[chatbot]
+        )
         send_btn.click(
+            fn=handle_user_message,
+            inputs=[msg_input, chatbot],
+            outputs=[chatbot, msg_input]
+        ).then(
+            fn=handle_bot_response,
+            inputs=[chatbot],
+            outputs=[chatbot]
         )
         end_chat_btn.click(
+            fn=handle_end_chat,
+            inputs=[chatbot],
+            outputs=[extracted_keywords_output, status_box_chatbot]
+        )
+        use_keywords_btn.click(
+            fn=handle_keyword_search,
+            inputs=[extracted_keywords_output],
+            outputs=[repo_ids_state, current_repo_idx_state, df_output, status_box_analysis, tabs]
         )
     return app
 if __name__ == "__main__":
     # Build the interface and start the Gradio server only when run as a script.
     app = create_ui()
+    app.launch(debug=True)