"""Gradio app that collects Hugging Face Space repo ids, analyzes them, and ranks them.

Repo ids come from one of three places: typed in directly, found through a
keyword search, or found through keywords extracted from a chatbot
conversation.  The ids and their analysis results are kept in ``repo_ids.csv``.
"""
import ast
import csv
import os

import gradio as gr
import pandas as pd
import regex as re

from analyzer import analyze_code  # noqa: F401  (kept: imported by the original file)
from analyzer import analyze_combined_file, combine_repo_files_for_llm, parse_llm_json_response
from chatbot_page import chat_with_user, extract_keywords_from_conversation  # Import chatbot logic
from hf_utils import download_space_repo, search_top_spaces

# Chatbot system prompt
CHATBOT_SYSTEM_PROMPT = (
    "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
    "Ask questions to clarify what they want, their use case, preferred language, features, etc. "
    "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
    "Return only the keywords as a comma-separated list."
)

# Initial assistant message for chatbot
CHATBOT_INITIAL_MESSAGE = "Hello! Please tell me about your ideal Hugging Face repo. What use case, preferred language, or features are you looking for?"

# System prompt for the "pick the best three repos" request.
TOP_REPO_SELECTION_PROMPT = (
    "You are a helpful assistant. You are given a list of repo analyses in JSON format. "
    "Choose the 3 repos that are the most impressive, relevant, or useful. "
    "Return ONLY a JSON array of the 3 best repo ids, in order of preference, under the key 'top_repos'. "
    "Example: {\"top_repos\": [\"repo1\", \"repo2\", \"repo3\"]}"
)
TOP_REPO_SELECTION_MODEL = "Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ"

CSV_FILENAME = "repo_ids.csv"
TOP3_CSV_FILENAME = "top3_repos.csv"
REPO_FILES_DIR = "repo_files"

# Columns stored in the CSV; every column after "repo id" is filled by the analysis.
CSV_COLUMNS = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
ANALYSIS_FIELDS = CSV_COLUMNS[1:]
# The result tables show one more header than the CSV stores.
DISPLAY_HEADERS = CSV_COLUMNS + ["Usecase"]

# Entries in the free-text boxes are separated by commas and/or newlines.
_ENTRY_SEPARATOR = re.compile(r'[\n,]+')

# Store the last entered repo ids and the current index in global variables for button access
last_repo_ids = []
current_repo_idx = 0

# Store extracted keywords for the chatbot flow
generated_keywords = []


def read_csv_as_text(csv_filename):
    """Load ``csv_filename`` into a DataFrame in which every cell is a string.

    ``keep_default_na=False`` keeps blank cells as ``""``.  Without it pandas
    parses them as NaN, which later shows up (and gets saved) as ``"nan"``.
    """
    return pd.read_csv(csv_filename, dtype=str, keep_default_na=False)


def _empty_repo_df():
    """Return an empty DataFrame that has the CSV columns."""
    return pd.DataFrame(columns=CSV_COLUMNS)


def _split_entries(text):
    """Split comma/newline separated ``text`` into stripped, non-empty entries."""
    return [entry.strip() for entry in _ENTRY_SEPARATOR.split(text) if entry.strip()]


def _unique_in_order(items):
    """Drop duplicates from ``items`` while keeping first-seen order."""
    return list(dict.fromkeys(items))


def _write_repo_ids_csv(repo_ids, csv_filename=CSV_FILENAME):
    """Overwrite ``csv_filename`` with one blank-analysis row per repo id and return it as a DataFrame."""
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(CSV_COLUMNS)
        writer.writerows([repo_id, "", "", "", ""] for repo_id in repo_ids)
    return read_csv_as_text(csv_filename)


def _store_repo_ids(repo_ids):
    """Make ``repo_ids`` the active work list, rewind the cursor, and rewrite the CSV."""
    global last_repo_ids, current_repo_idx
    last_repo_ids = list(repo_ids)
    current_repo_idx = 0
    return _write_repo_ids_csv(last_repo_ids)


def _search_and_store(keyword_list, limit):
    """Search spaces for each keyword (``limit`` hits each) and store the de-duplicated ids."""
    found = []
    for kw in keyword_list:
        # Presumably search_top_spaces returns an iterable of repo ids -- verify in hf_utils.
        found.extend(search_top_spaces(kw, limit=limit))
    return _store_repo_ids(_unique_in_order(found))


def process_repo_input(text):
    """Write the repo ids in ``text`` to the CSV and return the resulting DataFrame.

    Unlike ``process_repo_input_and_store`` this does not touch the module-level
    work list.  Empty input returns an empty frame and leaves the CSV alone.
    """
    if not text:
        return _empty_repo_df()
    return _write_repo_ids_csv(_split_entries(text))


def process_repo_input_and_store(text):
    """Store the repo ids in ``text`` as the active work list and return them as a DataFrame.

    Empty input clears the work list and returns an empty frame without
    rewriting the CSV.
    """
    global last_repo_ids, current_repo_idx
    if not text:
        last_repo_ids = []
        current_repo_idx = 0
        return _empty_repo_df()
    return _store_repo_ids(_split_entries(text))


def keyword_search_and_update(keyword):
    """Search up to 5 spaces per comma/newline separated keyword and store the results.

    Empty input returns an empty frame and leaves the work list and CSV unchanged.
    """
    if not keyword:
        return _empty_repo_df()
    return _search_and_store(_split_entries(keyword), limit=5)


def use_keywords_to_search_and_update_csv(keywords):
    """Search up to 3 spaces per keyword (chatbot flow) and store the results.

    Keywords may be separated by commas or newlines.  Empty input returns an
    empty frame and leaves the work list and CSV unchanged.
    """
    if not keywords:
        return _empty_repo_df()
    return _search_and_store(_split_entries(keywords), limit=3)  # limit=3 per keyword


def _parse_final_json(llm_output):
    """Parse the text between the last ``{`` and the last ``}`` of ``llm_output``.

    This picks out the final summary object when the output contains several.
    If no such span exists, the whole output is handed to the parser.
    """
    start = llm_output.rfind('{')
    end = llm_output.rfind('}')
    candidate = llm_output[start:end + 1] if start != -1 and end > start else llm_output
    return parse_llm_json_response(candidate)


def _is_valid_analysis(llm_json):
    """Return True when ``llm_json`` is a dict that carries no ``"error"`` key."""
    return isinstance(llm_json, dict) and "error" not in llm_json


def _as_cell(value):
    """Convert an analysis value to CSV cell text; None becomes an empty string."""
    return "" if value is None else str(value)


def _record_analysis(df, idx, llm_json):
    """Copy the analysis fields of ``llm_json`` into row ``idx`` of ``df``."""
    for field in ANALYSIS_FIELDS:
        # Values are stringified so lists or numbers cannot break the cell assignment.
        df.at[idx, field] = _as_cell(llm_json.get(field, ""))


def _read_csv_or_empty(csv_filename=CSV_FILENAME):
    """Read the CSV, falling back to an empty frame if it cannot be read."""
    try:
        return read_csv_as_text(csv_filename)
    except Exception:
        return _empty_repo_df()


def show_combined_repo_and_llm():
    """Download, combine and analyze the next repo in the work list.

    Returns a ``(combined file text, summary text, DataFrame)`` triple.  On a
    download or read error the first element carries the error message and the
    cursor is not advanced, so the next click retries the same repo.
    """
    global current_repo_idx
    if not last_repo_ids:
        return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
    if current_repo_idx >= len(last_repo_ids):
        return "All repo IDs have been processed.", "", read_csv_as_text(CSV_FILENAME)

    repo_id = last_repo_ids[current_repo_idx]
    try:
        download_space_repo(repo_id, local_dir=REPO_FILES_DIR)
    except Exception as e:
        return f"Error downloading repo: {e}", "", read_csv_as_text(CSV_FILENAME)

    txt_path = combine_repo_files_for_llm()
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            combined_content = f.read()
    except Exception as e:
        return f"Error reading {txt_path}: {e}", "", read_csv_as_text(CSV_FILENAME)

    # Only the last JSON object (final summary) is used for CSV writing
    llm_json = _parse_final_json(analyze_combined_file(txt_path))

    # Update CSV for the current repo id
    extraction_status = ""
    strengths = ""
    weaknesses = ""
    try:
        df = read_csv_as_text(CSV_FILENAME)
        matches = df.index[df["repo id"] == repo_id]
        if _is_valid_analysis(llm_json):
            strengths = llm_json.get("strength", "")
            weaknesses = llm_json.get("weaknesses", "")
            if len(matches):
                extraction_status = "JSON extraction: SUCCESS"
                _record_analysis(df, matches[0], llm_json)  # first matching row only
            else:
                # repo_id not found in the CSV: append a new row
                extraction_status = "JSON extraction: SUCCESS (new row)"
                new_row = {"repo id": repo_id}
                new_row.update({field: _as_cell(llm_json.get(field, "")) for field in ANALYSIS_FIELDS})
                df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
        else:
            raw = llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json
            extraction_status = f"JSON extraction: FAILED\nRaw: {raw}"
        df.to_csv(CSV_FILENAME, index=False)
    except Exception as e:
        df = _read_csv_or_empty()
        extraction_status = f"CSV update error: {e}"

    # Move to next repo for next click
    current_repo_idx += 1
    summary = f"{extraction_status}\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
    return combined_content, summary, df


def _four_updates(visible_slot):
    """Return four visibility updates in which only ``visible_slot`` is shown."""
    return tuple(gr.update(visible=(slot == visible_slot)) for slot in range(4))


# The go_to_* functions below return four updates each.  They are kept unchanged
# for backward compatibility; the click handlers further down use a helper that
# produces one update per page instead.
def go_to_analysis():
    """Return four visibility updates with only the second slot visible."""
    return _four_updates(1)


def go_to_input():
    """Return four visibility updates with only the second slot visible."""
    return _four_updates(1)


def go_to_chatbot():
    """Return four visibility updates with only the third slot visible."""
    return _four_updates(2)


def go_to_start():
    """Return four visibility updates with only the first slot visible."""
    return _four_updates(0)


def go_to_results():
    """Return four visibility updates with only the fourth slot visible."""
    return _four_updates(3)


repo_id_input = gr.Textbox(label="Enter repo IDs (comma or newline separated)", lines=5, placeholder="repo1, repo2\nrepo3")
df_output = gr.Dataframe(
    headers=DISPLAY_HEADERS,
    datatype=["str"] * len(DISPLAY_HEADERS),
)


def _analyze_all_repos(csv_filename=CSV_FILENAME):
    """Analyze every repo listed in the CSV and save the results back to it.

    Returns ``(df, report)`` where ``report`` joins one dict repr per repo with
    blank lines.  A failure on one repo is recorded as an ``"error"`` entry and
    does not stop the remaining repos.
    """
    df = read_csv_as_text(csv_filename)
    all_infos = []
    for idx, row in df.iterrows():
        repo_id = row["repo id"]
        try:
            download_space_repo(repo_id, local_dir=REPO_FILES_DIR)
            txt_path = combine_repo_files_for_llm()
            llm_json = _parse_final_json(analyze_combined_file(txt_path))
            if _is_valid_analysis(llm_json):
                _record_analysis(df, idx, llm_json)
            all_infos.append({"repo id": repo_id, **llm_json})
        except Exception as e:
            all_infos.append({"repo id": repo_id, "error": str(e)})
    df.to_csv(csv_filename, index=False)
    return df, "\n\n".join(str(info) for info in all_infos)


def _select_top_repos(all_info_str):
    """Ask the selection model for the best three repos and return the ``top_repos`` value."""
    # Imported here, as in the original, so the module loads without the openai package.
    from openai import OpenAI

    client = OpenAI(api_key=os.getenv("modal_api"))
    client.base_url = os.getenv("base_url")
    response = client.chat.completions.create(
        model=TOP_REPO_SELECTION_MODEL,
        messages=[
            {"role": "system", "content": TOP_REPO_SELECTION_PROMPT},
            {"role": "user", "content": "Here are the repo analyses:\n" + all_info_str},
        ],
        max_tokens=256,
        temperature=0.3,
    )
    selection_json = parse_llm_json_response(response.choices[0].message.content)
    return selection_json.get("top_repos", [])


def batch_analyze_and_select_top():
    """Analyze every repo in the CSV and let the LLM choose the best 3.

    Returns ``(report text, str(top repo ids), DataFrame)``.  On failure it
    returns ``(error message, "", empty DataFrame)``.
    """
    try:
        df, all_info_str = _analyze_all_repos()
        top_repos = _select_top_repos(all_info_str)
        return all_info_str, str(top_repos), df
    except Exception as e:
        return f"Error in batch analysis: {e}", "", pd.DataFrame()


def batch_analyze_and_select_top_for_chat(state):
    """Run the batch analysis and append its outcome to the chat history ``state``.

    Returns a new history list ending in a ``["", message]`` pair holding either
    the top 3 repo ids or the error text.  ``state`` itself is not mutated.
    """
    history = [] if state is None else state
    try:
        _, all_info_str = _analyze_all_repos()
        top_repos = _select_top_repos(all_info_str)
        # Add a new assistant message to the chat state
        message = f"The top 3 repo IDs are: {', '.join(map(str, top_repos))}"
    except Exception as e:
        message = f"Error in batch analysis: {e}"
    return history + [["", message]]


with gr.Blocks() as demo:
    page_state = gr.State(0)

    # --- Start Page: Option Selection ---
    with gr.Column(visible=True) as start_page:
        gr.Markdown("## Welcome! How would you like to proceed?")
        option_a_btn = gr.Button("A) I know which repos I want to search and research about")
        option_b_btn = gr.Button("B) I don't know exactly what I want (Chatbot)")

    # --- Page 1: Input ---
    with gr.Column(visible=False) as input_page:
        gr.Markdown("## Enter Keyword or Repo IDs")
        keyword_input = gr.Textbox(label="Enter keywords to search repos (comma or newline separated)", lines=2, placeholder="e.g. audio, vision\ntext")
        keyword_btn = gr.Button("Search and Update Repo List")
        repo_id_box = repo_id_input.render()
        df_box = df_output.render()
        submit_btn = gr.Button("Submit Repo IDs")
        next_btn = gr.Button("Next: Go to Analysis")
        back_to_start_btn = gr.Button("Back to Start")

    # --- Page 2: Analysis ---
    with gr.Column(visible=False) as analysis_page:
        gr.Markdown("## Combine and Display Repo Files")
        combine_btn = gr.Button("Download, Combine & Show .py/.md Files from Next Repo and Analyze")
        combined_txt = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt = gr.Textbox(label="LLM Analysis Output", lines=10)
        df_display = gr.Dataframe(
            headers=DISPLAY_HEADERS,
            datatype=["str"] * len(DISPLAY_HEADERS),
        )
        back_btn = gr.Button("Back to Input")
        back_to_start_btn2 = gr.Button("Back to Start")

    # --- Page 3: Chatbot ---
    with gr.Column(visible=False) as chatbot_page:
        gr.Markdown("## Repo Recommendation Chatbot")
        chatbot = gr.Chatbot()
        state = gr.State([])
        user_input = gr.Textbox(label="Your message", placeholder="Describe your ideal repo or answer the assistant's questions...")
        send_btn = gr.Button("Send")
        end_btn = gr.Button("End Chat and Extract Keywords")
        keywords_output = gr.Textbox(label="Extracted Keywords for Repo Search", interactive=False)
        go_to_results_btn = gr.Button("Find Repos with These Keywords")
        back_to_start_btn3 = gr.Button("Back to Start")

    # --- Page 4: Results after Chatbot ---
    with gr.Column(visible=False) as results_page:
        gr.Markdown("## Repo Results Based on Your Conversation")
        results_df = gr.Dataframe(
            headers=DISPLAY_HEADERS,
            datatype=["str"] * len(DISPLAY_HEADERS),
        )
        analyze_next_btn = gr.Button("Download, Combine & Analyze Next Repo")
        combined_txt_results = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt_results = gr.Textbox(label="LLM Analysis Output", lines=10)
        back_to_start_btn4 = gr.Button("Back to Start")
        go_to_batch_btn = gr.Button("Go to Batch Analysis Page", visible=True)

    # --- Page 5: Batch Analysis Page ---
    with gr.Column(visible=False) as batch_page:
        gr.Markdown("## Batch Analysis & Top 3 Selection")
        batch_btn = gr.Button("Batch Analyze All & Select Top 3", visible=True)
        batch_info_txt = gr.Textbox(label="All Repo Analyses", lines=10)
        top3_txt = gr.Textbox(label="Top 3 Repo IDs", lines=1)
        show_top3_chat_btn = gr.Button("Show Top 3 Repo IDs in Chat", visible=True)
        show_top3_page_btn = gr.Button("Show Top 3 Repos on New Page", visible=True)
        back_to_results_from_batch_btn = gr.Button("Back to Results")

    # --- Page 6: Top 3 Repos Page ---
    with gr.Column(visible=False) as top3_page:
        gr.Markdown("## Top 3 Recommended Repos")
        top3_df = gr.Dataframe(headers=["repo id"], datatype=["str"])
        back_to_results_btn = gr.Button("Back to Results")

    # Navigation logic
    # Every navigation handler updates ALL pages through this one list.  A handler
    # must return exactly one value per output component, and a page missing from
    # the list could never be hidden again.
    all_pages = [start_page, input_page, analysis_page, chatbot_page, results_page, batch_page, top3_page]

    def page_updates(target):
        """Return one visibility update per page, showing only ``target``."""
        return tuple(gr.update(visible=(page is target)) for page in all_pages)

    def show_only(target):
        """Build a no-argument click handler that switches to ``target``."""
        return lambda: page_updates(target)

    def open_chatbot():
        """Switch to the chatbot page and seed the chat with the greeting."""
        # The greeting goes to both the visible chatbot and the history state,
        # otherwise it only appears after the first message is sent.
        return (
            *page_updates(chatbot_page),
            [["", CHATBOT_INITIAL_MESSAGE]],
            [["", CHATBOT_INITIAL_MESSAGE]],
        )

    option_a_btn.click(show_only(input_page), inputs=None, outputs=all_pages)
    option_b_btn.click(open_chatbot, inputs=None, outputs=all_pages + [chatbot, state])
    next_btn.click(show_only(analysis_page), inputs=None, outputs=all_pages)
    back_btn.click(show_only(input_page), inputs=None, outputs=all_pages)
    for start_button in (back_to_start_btn, back_to_start_btn2, back_to_start_btn3, back_to_start_btn4):
        start_button.click(show_only(start_page), inputs=None, outputs=all_pages)
    go_to_batch_btn.click(show_only(batch_page), inputs=None, outputs=all_pages)
    back_to_results_from_batch_btn.click(show_only(results_page), inputs=None, outputs=all_pages)
    back_to_results_btn.click(show_only(results_page), inputs=None, outputs=all_pages)

    # Keyword and repo input logic
    keyword_btn.click(keyword_search_and_update, inputs=keyword_input, outputs=df_box)
    submit_btn.click(process_repo_input_and_store, inputs=repo_id_box, outputs=df_box)

    # Analysis logic
    combine_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt, llm_output_txt, df_display])

    # Chatbot logic
    def user_send(user_message, history):
        """Get the assistant's reply and return (chatbot value, new state, cleared input)."""
        assistant_reply = chat_with_user(user_message, history)
        history = history + [[user_message, assistant_reply]]
        return history, history, ""

    def end_chat(history):
        """Extract keywords from the conversation, remember them, and return the raw text."""
        keywords = extract_keywords_from_conversation(history)
        # Mutated in place so other references to the module-level list stay valid.
        generated_keywords.clear()
        generated_keywords.extend([k.strip() for k in keywords.split(",") if k.strip()])
        return keywords

    def go_to_results_from_chatbot(keywords):
        """Search with the extracted keywords and switch to the results page."""
        # Use the keywords to search and update the CSV, then display the DataFrame
        df = use_keywords_to_search_and_update_csv(keywords)
        return (*page_updates(results_page), df)

    send_btn.click(user_send, inputs=[user_input, state], outputs=[chatbot, state, user_input])
    end_btn.click(end_chat, inputs=state, outputs=keywords_output)
    go_to_results_btn.click(
        go_to_results_from_chatbot,
        inputs=keywords_output,
        outputs=all_pages + [results_df],
    )

    # Add logic for the new button on results_page
    analyze_next_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt_results, llm_output_txt_results, results_df])
    batch_btn.click(batch_analyze_and_select_top, inputs=None, outputs=[batch_info_txt, top3_txt, df_output])
    show_top3_chat_btn.click(batch_analyze_and_select_top_for_chat, inputs=[state], outputs=[state])

    def show_top3_page():
        """Run the batch analysis, save the top 3 ids to a CSV, and show them on the top-3 page."""
        # Run batch analysis, get top 3, save to CSV, and return DataFrame
        _, top3_str, _ = batch_analyze_and_select_top()
        try:
            top3_ids = ast.literal_eval(top3_str)
        except Exception:
            top3_ids = []
        if isinstance(top3_ids, str):
            top3_ids = [top3_ids]
        elif not isinstance(top3_ids, (list, tuple)):
            # Any other shape cannot be used as a column of repo ids.
            top3_ids = []
        top3_df_data = pd.DataFrame({"repo id": list(top3_ids)})
        top3_df_data.to_csv(TOP3_CSV_FILENAME, index=False)
        return (*page_updates(top3_page), top3_df_data)

    show_top3_page_btn.click(show_top3_page, inputs=None, outputs=all_pages + [top3_df])

demo.launch()