HF_RepoSense / app.py
naman1102's picture
Update app.py
6673deb
raw
history blame
13.9 kB
import gradio as gr
import regex as re
import csv
import pandas as pd
from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response
from hf_utils import download_space_repo, search_top_spaces
from chatbot_page import chat_with_user, extract_keywords_from_conversation
# Additional analyzer import (the chatbot helpers are imported on the line above)
from analyzer import analyze_code
# System prompt text for the chatbot. The adjacent string literals below are
# joined into one single-line string; each piece ends with a space so the
# sentences stay separated.
CHATBOT_SYSTEM_PROMPT = (
    "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
    "Ask questions to clarify what they want, their use case, preferred language, features, etc. "
    "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
    "Return only the keywords as a comma-separated list."
)
def read_csv_as_text(csv_filename):
    """Load a CSV file into a DataFrame in which every cell is text.

    Args:
        csv_filename: Path of the CSV file to read.

    Returns:
        A pandas DataFrame whose values are all ``str``. Blank cells come
        back as ``""`` and tokens such as ``"NA"`` or ``"null"`` are kept
        verbatim instead of being turned into missing values.
    """
    # dtype=str alone still converts blank cells (and tokens like "NA") to a
    # float NaN. keep_default_na=False switches that detection off so the
    # frame really is all text and never shows up or round-trips as "nan".
    return pd.read_csv(csv_filename, dtype=str, keep_default_na=False)
def process_repo_input(text):
    """Turn a comma/newline separated list of repo IDs into a fresh results table.

    Overwrites ``repo_ids.csv`` with a header row plus one row per repo ID
    (the analysis columns are left blank) and returns that file re-read as a
    DataFrame. A falsy ``text`` returns an empty DataFrame with the same
    columns and does not touch the CSV.
    """
    header = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    if not text:
        return pd.DataFrame(columns=header)

    # Commas and newlines both separate IDs; blank fragments are discarded.
    fragments = (chunk.strip() for chunk in re.split(r'[\n,]+', text))
    ids = [fragment for fragment in fragments if fragment]

    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as out:
        table = csv.writer(out)
        table.writerow(header)
        table.writerows([rid, "", "", "", ""] for rid in ids)

    # Hand back the table as read from disk, for display.
    return read_csv_as_text(csv_filename)
# Module-level state shared by the button callbacks in this file.
# last_repo_ids: the repo IDs most recently submitted or found by a search.
# current_repo_idx: index into last_repo_ids of the next repo to analyze;
# it is reset to 0 whenever last_repo_ids is replaced.
last_repo_ids = []
current_repo_idx = 0
# Keywords extracted when a chatbot conversation is ended (filled by end_chat).
generated_keywords = []
def process_repo_input_and_store(text):
    """Parse submitted repo IDs, remember them, and rebuild the results table.

    Side effects: replaces the module-level ``last_repo_ids`` with the parsed
    IDs, resets ``current_repo_idx`` to 0, and (for non-empty input)
    overwrites ``repo_ids.csv`` with a header plus one blank-analysis row per
    ID.

    Returns:
        The CSV re-read as a DataFrame, or an empty DataFrame with the same
        columns when ``text`` is falsy (in which case the stored ID list is
        cleared and the CSV is left untouched).
    """
    global last_repo_ids, current_repo_idx

    header = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    if not text:
        last_repo_ids, current_repo_idx = [], 0
        return pd.DataFrame(columns=header)

    # Commas and newlines both separate IDs; blank fragments are discarded.
    fragments = (part.strip() for part in re.split(r'[\n,]+', text))
    parsed_ids = [fragment for fragment in fragments if fragment]
    last_repo_ids, current_repo_idx = parsed_ids, 0

    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as out:
        table = csv.writer(out)
        table.writerow(header)
        table.writerows([rid, "", "", "", ""] for rid in parsed_ids)

    return read_csv_as_text(csv_filename)
def keyword_search_and_update(keyword):
    """Search for repos by keyword(s) and rebuild the results table from the hits.

    ``keyword`` may hold several keywords separated by commas or newlines.
    ``search_top_spaces`` is called once per keyword with ``limit=5``; the
    combined hits are de-duplicated in first-seen order, stored in the
    module-level ``last_repo_ids`` (with ``current_repo_idx`` reset to 0) and
    written to ``repo_ids.csv`` with blank analysis columns.

    Returns:
        The CSV re-read as a DataFrame. A falsy ``keyword`` returns an empty
        DataFrame with the same columns and changes neither the stored state
        nor the CSV.
    """
    global last_repo_ids, current_repo_idx

    header = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    if not keyword:
        return pd.DataFrame(columns=header)

    stripped = (raw.strip() for raw in re.split(r'[\n,]+', keyword))
    terms = [term for term in stripped if term]

    # Hits are gathered keyword by keyword, in the order the keywords were given.
    hits = [rid for term in terms for rid in search_top_spaces(term, limit=5)]

    # dict keys keep insertion order, so this drops repeats but keeps first-seen order.
    distinct_ids = list(dict.fromkeys(hits))

    last_repo_ids, current_repo_idx = distinct_ids, 0

    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as out:
        table = csv.writer(out)
        table.writerow(header)
        table.writerows([rid, "", "", "", ""] for rid in distinct_ids)

    return read_csv_as_text(csv_filename)
def show_combined_repo_and_llm():
    """Download, combine and analyze the next pending repo, then record the result.

    Operates on ``last_repo_ids[current_repo_idx]``: downloads the repo into
    ``repo_files``, builds the combined text file, runs the LLM analysis on
    it, writes the parsed fields into that repo's row of ``repo_ids.csv`` and
    advances ``current_repo_idx`` by one.

    Returns:
        A 3-tuple ``(text, summary, df)``:
        * ``text`` - the combined file contents, or a status/error message
          when there is nothing to process or download/reading failed;
        * ``summary`` - extraction status plus strengths and weaknesses
          (empty string on the early-return paths);
        * ``df`` - the current contents of ``repo_ids.csv`` as a DataFrame
          (an empty DataFrame when no repo IDs are stored or the CSV cannot
          be read).
    """
    global current_repo_idx

    csv_filename = "repo_ids.csv"

    if not last_repo_ids:
        return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
    if current_repo_idx >= len(last_repo_ids):
        return "All repo IDs have been processed.", "", read_csv_as_text(csv_filename)

    repo_id = last_repo_ids[current_repo_idx]

    # NOTE: the two error returns below happen before current_repo_idx is
    # advanced, so the next click retries the same repo.
    try:
        download_space_repo(repo_id, local_dir="repo_files")
    except Exception as e:
        return f"Error downloading repo: {e}", "", read_csv_as_text(csv_filename)

    txt_path = combine_repo_files_for_llm()
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            combined_content = f.read()
    except Exception as e:
        return f"Error reading {txt_path}: {e}", "", read_csv_as_text(csv_filename)

    llm_output = analyze_combined_file(txt_path)
    llm_json = parse_llm_json_response(llm_output)

    # Update CSV for the current repo id
    extraction_status = ""
    strengths = ""
    weaknesses = ""
    try:
        df = read_csv_as_text(csv_filename)
        for col in ["strength", "weaknesses", "speciality", "relevance rating"]:
            # Blank cells may be read back as NaN; normalise them to "" before
            # the str cast so rows that have not been analyzed yet are not
            # rewritten (and displayed) as the literal text "nan".
            df[col] = df[col].fillna("").astype(str)
        for idx, row in df.iterrows():
            if row["repo id"] == repo_id:
                if isinstance(llm_json, dict) and "error" not in llm_json:
                    extraction_status = "JSON extraction: SUCCESS"
                    strengths = llm_json.get("strength", "")
                    weaknesses = llm_json.get("weaknesses", "")
                    df.at[idx, "strength"] = strengths
                    df.at[idx, "weaknesses"] = weaknesses
                    df.at[idx, "speciality"] = llm_json.get("speciality", "")
                    df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
                else:
                    extraction_status = f"JSON extraction: FAILED\nRaw: {llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json}"
                # Only the first row matching this repo id is updated.
                break
        df.to_csv(csv_filename, index=False)
    except Exception as e:
        extraction_status = f"CSV update error: {e}"
        # The failure may have been the read itself; re-reading must not be
        # allowed to raise out of the error handler.
        try:
            df = read_csv_as_text(csv_filename)
        except Exception:
            df = pd.DataFrame()

    # Move to next repo for next click
    current_repo_idx += 1
    summary = f"{extraction_status}\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
    return combined_content, summary, df
def go_to_analysis():
    """Return four visibility updates in the order hide, show, hide, hide.

    The updates are positional: which page each one applies to is decided by
    the ``outputs`` list of the click handler this function is attached to.
    """
    visibility = (False, True, False, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_input():
    """Return four visibility updates in the order hide, show, hide, hide.

    The updates are positional: the component that receives the single
    "show" is whichever one is listed second in the ``outputs`` of the click
    handler this function is attached to.
    """
    visibility = (False, True, False, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_chatbot():
    """Return four visibility updates in the order hide, hide, show, hide.

    The updates are positional: the component listed third in the click
    handler's ``outputs`` is the one made visible.
    """
    visibility = (False, False, True, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_start():
    """Return four visibility updates in the order show, hide, hide, hide.

    The updates are positional: the component listed first in the click
    handler's ``outputs`` is made visible and the other three are hidden.
    """
    visibility = (True, False, False, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_results():
    """Return four visibility updates in the order hide, hide, hide, show.

    The updates are positional: the component listed fourth in the click
    handler's ``outputs`` is the one made visible.
    """
    visibility = (False, False, False, True)
    return tuple(gr.update(visible=shown) for shown in visibility)
# These two components are created here, outside the Blocks context, and are
# placed on the input page later through their .render() calls.
repo_id_input = gr.Textbox(label="Enter repo IDs (comma or newline separated)", lines=5, placeholder="repo1, repo2\nrepo3")
# NOTE(review): six headers are declared ("Usecase" is the sixth) while the CSV
# written by this file has five columns - confirm whether "Usecase" is intended.
df_output = gr.Dataframe(headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
datatype=["str", "str", "str", "str", "str", "str"]
)
def use_keywords_to_search_and_update_csv(keywords):
    """Search repos for each comma-separated keyword and rebuild the results table.

    ``keywords`` is split on commas only. ``search_top_spaces`` is called once
    per keyword with ``limit=3``; the combined hits are de-duplicated in
    first-seen order, stored in the module-level ``last_repo_ids`` (with
    ``current_repo_idx`` reset to 0) and written to ``repo_ids.csv`` with
    blank analysis columns.

    Returns:
        The CSV re-read as a DataFrame. A falsy ``keywords`` returns an empty
        DataFrame with the same columns and changes neither the stored state
        nor the CSV.
    """
    global last_repo_ids, current_repo_idx

    header = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    if not keywords:
        return pd.DataFrame(columns=header)

    # Split keywords and search for each
    stripped = (raw.strip() for raw in keywords.split(","))
    terms = [term for term in stripped if term]
    hits = [rid for term in terms for rid in search_top_spaces(term, limit=3)]  # limit=3 per keyword

    # dict keys keep insertion order, so this drops repeats but keeps first-seen order.
    distinct_ids = list(dict.fromkeys(hits))

    last_repo_ids, current_repo_idx = distinct_ids, 0

    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as out:
        table = csv.writer(out)
        table.writerow(header)
        table.writerows([rid, "", "", "", ""] for rid in distinct_ids)

    return read_csv_as_text(csv_filename)
# UI definition: five pages (start, input, analysis, chatbot, results), each a
# gr.Column whose visibility is toggled by the navigation callbacks. The
# go_to_* callbacks return four positional visibility updates, so the ORDER of
# each `outputs` list below decides which page is shown and which are hidden.
with gr.Blocks() as demo:
    page_state = gr.State(0)

    # --- Start Page: Option Selection ---
    with gr.Column(visible=True) as start_page:
        gr.Markdown("## Welcome! How would you like to proceed?")
        option_a_btn = gr.Button("A) I know which repos I want to search and research about")
        option_b_btn = gr.Button("B) I don't know exactly what I want (Chatbot)")

    # --- Page 1: Input ---
    with gr.Column(visible=False) as input_page:
        gr.Markdown("## Enter Keyword or Repo IDs")
        keyword_input = gr.Textbox(label="Enter keywords to search repos (comma or newline separated)", lines=2, placeholder="e.g. audio, vision\ntext")
        keyword_btn = gr.Button("Search and Update Repo List")
        repo_id_box = repo_id_input.render()
        df_box = df_output.render()
        submit_btn = gr.Button("Submit Repo IDs")
        next_btn = gr.Button("Next: Go to Analysis")
        back_to_start_btn = gr.Button("Back to Start")

    # --- Page 2: Analysis ---
    with gr.Column(visible=False) as analysis_page:
        gr.Markdown("## Combine and Display Repo Files")
        combine_btn = gr.Button("Download, Combine & Show .py/.md Files from Next Repo and Analyze")
        combined_txt = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt = gr.Textbox(label="LLM Analysis Output", lines=10)
        df_display = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"]
        )
        back_btn = gr.Button("Back to Input")
        back_to_start_btn2 = gr.Button("Back to Start")

    # --- Page 3: Chatbot ---
    with gr.Column(visible=False) as chatbot_page:
        gr.Markdown("## Repo Recommendation Chatbot")
        chatbot = gr.Chatbot()
        state = gr.State([])
        user_input = gr.Textbox(label="Your message", placeholder="Describe your ideal repo or answer the assistant's questions...")
        send_btn = gr.Button("Send")
        end_btn = gr.Button("End Chat and Extract Keywords")
        keywords_output = gr.Textbox(label="Extracted Keywords for Repo Search", interactive=False)
        go_to_results_btn = gr.Button("Find Repos with These Keywords")
        back_to_start_btn3 = gr.Button("Back to Start")

    # --- Page 4: Results after Chatbot ---
    with gr.Column(visible=False) as results_page:
        gr.Markdown("## Repo Results Based on Your Conversation")
        results_df = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"]
        )
        analyze_next_btn = gr.Button("Download, Combine & Analyze Next Repo")
        combined_txt_results = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt_results = gr.Textbox(label="LLM Analysis Output", lines=10)
        back_to_start_btn4 = gr.Button("Back to Start")

    # Navigation logic
    option_a_btn.click(go_to_input, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    option_b_btn.click(go_to_chatbot, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    next_btn.click(go_to_analysis, inputs=None, outputs=[input_page, analysis_page, chatbot_page, results_page])
    # go_to_input returns (hide, show, hide, hide): analysis_page must come
    # first and input_page second, otherwise "Back to Input" would hide the
    # input page and leave the analysis page on screen.
    back_btn.click(go_to_input, inputs=None, outputs=[analysis_page, input_page, chatbot_page, results_page])
    back_to_start_btn.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    # This button lives on the analysis page, so analysis_page has to be among
    # the hidden outputs; otherwise it would stay visible next to the start page.
    back_to_start_btn2.click(go_to_start, inputs=None, outputs=[start_page, analysis_page, chatbot_page, results_page])
    back_to_start_btn3.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    back_to_start_btn4.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])

    # Keyword and repo input logic
    keyword_btn.click(keyword_search_and_update, inputs=keyword_input, outputs=df_box)
    submit_btn.click(process_repo_input_and_store, inputs=repo_id_box, outputs=df_box)

    # Analysis logic
    combine_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt, llm_output_txt, df_display])

    # Chatbot logic
    def user_send(user_message, history):
        """Send one user message and append the exchange to the history.

        Returns the extended history twice (for the chat display and for the
        stored state) followed by "" to clear the message box.
        """
        assistant_reply = chat_with_user(user_message, history)
        # A new list is built rather than mutating the incoming history.
        history = history + [[user_message, assistant_reply]]
        return history, history, ""

    def end_chat(history):
        """Extract search keywords from the conversation and remember them.

        The comma-separated keyword string is returned unchanged for display;
        its stripped, non-empty parts replace the contents of the
        module-level ``generated_keywords`` list.
        """
        global generated_keywords
        keywords = extract_keywords_from_conversation(history)
        generated_keywords.clear()
        generated_keywords.extend([k.strip() for k in keywords.split(",") if k.strip()])
        return keywords

    def go_to_results_from_chatbot(keywords):
        """Search with the extracted keywords and switch to the results page.

        Returns four visibility updates (hide, hide, hide, show) followed by
        the DataFrame of found repos.
        """
        # Use the keywords to search and update the CSV, then display the DataFrame
        df = use_keywords_to_search_and_update_csv(keywords)
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), df

    send_btn.click(user_send, inputs=[user_input, state], outputs=[chatbot, state, user_input])
    end_btn.click(end_chat, inputs=state, outputs=keywords_output)
    go_to_results_btn.click(
        go_to_results_from_chatbot,
        inputs=keywords_output,
        outputs=[chatbot_page, input_page, analysis_page, results_page, results_df]
    )

    # Add logic for the new button on results_page
    analyze_next_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt_results, llm_output_txt_results, results_df])

demo.launch()