|
"""Gradio app for finding and analyzing Hugging Face Spaces.

Users either submit repo ids / search keywords directly, or chat with an
assistant that distills the conversation into search keywords. Each repo is
then downloaded, its files combined, and an LLM analysis written back to
repo_ids.csv.
"""

import csv
import re  # stdlib re covers the simple split patterns used below

import gradio as gr
import pandas as pd

from analyzer import analyze_combined_file, combine_repo_files_for_llm, parse_llm_json_response
from chatbot_page import chat_with_user, extract_keywords_from_conversation
from hf_utils import download_space_repo, search_top_spaces

CHATBOT_SYSTEM_PROMPT = (
    "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
    "Ask questions to clarify what they want, their use case, preferred language, features, etc. "
    "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
    "Return only the keywords as a comma-separated list."
)
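
# Downstream code (end_chat and use_keywords_to_search_and_update_csv) splits
# the model's reply on commas, so the prompt above must produce plain output
# such as "image classification, gradio demo, transformers, computer vision"
# (an illustrative example, not fixed keywords).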
|
|
|
def read_csv_as_text(csv_filename):
    """Load the results CSV with every column as str, so empty cells stay strings."""
    return pd.read_csv(csv_filename, dtype=str)
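
# Constants and helpers consolidating logic that was previously repeated in
# each of the callbacks below (CSV writing and order-preserving de-duplication).
CSV_FILENAME = "repo_ids.csv"
CSV_COLUMNS = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]


def write_repo_ids_csv(repo_ids):
    """(Re)write repo_ids.csv with one empty analysis row per repo id."""
    with open(CSV_FILENAME, mode="w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(CSV_COLUMNS)
        for repo_id in repo_ids:
            writer.writerow([repo_id, "", "", "", ""])


def dedupe_preserve_order(items):
    """Drop duplicate items while keeping first-seen order."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            unique.append(item)
            seen.add(item)
    return unique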
|
|
|
def process_repo_input(text):
    """Parse comma/newline-separated repo ids, write them to the CSV, and return it."""
    if not text:
        return pd.DataFrame(columns=CSV_COLUMNS)
    repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
    write_repo_ids_csv(repo_ids)
    return read_csv_as_text(CSV_FILENAME)


# Module-level state shared across Gradio callbacks: the queue of repo ids to
# analyze, the index of the next one, and the keywords extracted from the chat.
last_repo_ids = []
current_repo_idx = 0
generated_keywords = []


def process_repo_input_and_store(text):
    """Like process_repo_input, but also queue the ids for one-by-one analysis."""
    global last_repo_ids, current_repo_idx
    if not text:
        last_repo_ids = []
        current_repo_idx = 0
        return pd.DataFrame(columns=CSV_COLUMNS)
    repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
    last_repo_ids = repo_ids
    current_repo_idx = 0
    write_repo_ids_csv(repo_ids)
    return read_csv_as_text(CSV_FILENAME)
|
|
def keyword_search_and_update(keyword):
    """Search the top Spaces for each keyword and load the unique hits into the CSV."""
    global last_repo_ids, current_repo_idx
    if not keyword:
        return pd.DataFrame(columns=CSV_COLUMNS)
    keyword_list = [k.strip() for k in re.split(r'[\n,]+', keyword) if k.strip()]
    repo_ids = []
    for kw in keyword_list:
        repo_ids.extend(search_top_spaces(kw, limit=5))
    unique_repo_ids = dedupe_preserve_order(repo_ids)
    last_repo_ids = unique_repo_ids
    current_repo_idx = 0
    write_repo_ids_csv(unique_repo_ids)
    return read_csv_as_text(CSV_FILENAME)
|
|
def show_combined_repo_and_llm():
    """Download the next queued repo, combine its .py/.md files, run the LLM
    analysis, and write the parsed results back into repo_ids.csv."""
    global current_repo_idx
    if not last_repo_ids:
        return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
    if current_repo_idx >= len(last_repo_ids):
        return "All repo IDs have been processed.", "", read_csv_as_text(CSV_FILENAME)
    repo_id = last_repo_ids[current_repo_idx]
    try:
        download_space_repo(repo_id, local_dir="repo_files")
    except Exception as e:
        return f"Error downloading repo: {e}", "", read_csv_as_text(CSV_FILENAME)
    try:
        txt_path = combine_repo_files_for_llm()
    except Exception as e:
        return f"Error combining repo files: {e}", "", read_csv_as_text(CSV_FILENAME)
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            combined_content = f.read()
    except Exception as e:
        return f"Error reading {txt_path}: {e}", "", read_csv_as_text(CSV_FILENAME)
    llm_output = analyze_combined_file(txt_path)

    # Heuristic: take the last '{'...'}' span of the raw output as the JSON
    # payload. This assumes a flat (non-nested) object, which matches the
    # schema requested from the LLM.
    last_start = llm_output.rfind('{')
    last_end = llm_output.rfind('}')
    if last_start != -1 and last_end != -1 and last_end > last_start:
        final_json_str = llm_output[last_start:last_end + 1]
    else:
        final_json_str = llm_output
    llm_json = parse_llm_json_response(final_json_str)

    extraction_status = ""
    strengths = ""
    weaknesses = ""
    try:
        df = read_csv_as_text(CSV_FILENAME)
        for col in ["strength", "weaknesses", "speciality", "relevance rating"]:
            df[col] = df[col].astype(str)
        updated = False
        for idx, row in df.iterrows():
            if row["repo id"] == repo_id:
                if isinstance(llm_json, dict) and "error" not in llm_json:
                    extraction_status = "JSON extraction: SUCCESS"
                    strengths = llm_json.get("strength", "")
                    weaknesses = llm_json.get("weaknesses", "")
                    df.at[idx, "strength"] = strengths
                    df.at[idx, "weaknesses"] = weaknesses
                    df.at[idx, "speciality"] = llm_json.get("speciality", "")
                    df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
                    updated = True
                else:
                    raw = llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json
                    extraction_status = f"JSON extraction: FAILED\nRaw: {raw}"
                break
        # The repo id was not in the CSV yet: append a fresh row instead.
        if not updated and isinstance(llm_json, dict) and "error" not in llm_json:
            extraction_status = "JSON extraction: SUCCESS (new row)"
            strengths = llm_json.get("strength", "")
            weaknesses = llm_json.get("weaknesses", "")
            new_row = {
                "repo id": repo_id,
                "strength": strengths,
                "weaknesses": weaknesses,
                "speciality": llm_json.get("speciality", ""),
                "relevance rating": llm_json.get("relevance rating", ""),
            }
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
        df.to_csv(CSV_FILENAME, index=False)
    except Exception as e:
        df = read_csv_as_text(CSV_FILENAME)
        extraction_status = f"CSV update error: {e}"

    current_repo_idx += 1
    summary = f"{extraction_status}\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
    return combined_content, summary, df
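
# Shape of the JSON object the analysis above expects from the LLM
# (illustrative; the key names are taken from the .get() lookups above, and
# the analyzer module owns the authoritative schema):
#   {"strength": "...", "weaknesses": "...", "speciality": "...",
#    "relevance rating": "..."}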
|
|
|
# Page navigation helpers. Each returns visibility updates for all five pages
# in a fixed order (start, input, analysis, chatbot, results), showing exactly
# one page, so a single outputs list works for every navigation button.
def _show_page(page_index):
    return tuple(gr.update(visible=(i == page_index)) for i in range(5))


def go_to_start():
    return _show_page(0)


def go_to_input():
    return _show_page(1)


def go_to_analysis():
    return _show_page(2)


def go_to_chatbot():
    return _show_page(3)


def go_to_results():
    return _show_page(4)
|
|
# These two components are created before the gr.Blocks context and attached
# to the layout later via .render(), which Gradio supports. The extra
# "Usecase" column is declared in the headers but never populated from
# repo_ids.csv, which carries only the five analysis columns.
repo_id_input = gr.Textbox(label="Enter repo IDs (comma or newline separated)", lines=5, placeholder="repo1, repo2\nrepo3")
df_output = gr.Dataframe(
    headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
    datatype=["str", "str", "str", "str", "str", "str"],
)
|
|
def use_keywords_to_search_and_update_csv(keywords):
    """Search Spaces for the chatbot-extracted keywords (3 hits each) and load the CSV."""
    global last_repo_ids, current_repo_idx
    if not keywords:
        return pd.DataFrame(columns=CSV_COLUMNS)
    keyword_list = [k.strip() for k in keywords.split(",") if k.strip()]
    repo_ids = []
    for kw in keyword_list:
        repo_ids.extend(search_top_spaces(kw, limit=3))
    unique_repo_ids = dedupe_preserve_order(repo_ids)
    last_repo_ids = unique_repo_ids
    current_repo_idx = 0
    write_repo_ids_csv(unique_repo_ids)
    return read_csv_as_text(CSV_FILENAME)
|
|
with gr.Blocks() as demo:
    page_state = gr.State(0)

    with gr.Column(visible=True) as start_page:
        gr.Markdown("## Welcome! How would you like to proceed?")
        option_a_btn = gr.Button("A) I know which repos I want to search and research about")
        option_b_btn = gr.Button("B) I don't know exactly what I want (Chatbot)")

    with gr.Column(visible=False) as input_page:
        gr.Markdown("## Enter Keyword or Repo IDs")
        keyword_input = gr.Textbox(label="Enter keywords to search repos (comma or newline separated)", lines=2, placeholder="e.g. audio, vision\ntext")
        keyword_btn = gr.Button("Search and Update Repo List")
        repo_id_box = repo_id_input.render()
        df_box = df_output.render()
        submit_btn = gr.Button("Submit Repo IDs")
        next_btn = gr.Button("Next: Go to Analysis")
        back_to_start_btn = gr.Button("Back to Start")
|
|
    with gr.Column(visible=False) as analysis_page:
        gr.Markdown("## Combine and Display Repo Files")
        combine_btn = gr.Button("Download, Combine & Show .py/.md Files from Next Repo and Analyze")
        combined_txt = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt = gr.Textbox(label="LLM Analysis Output", lines=10)
        df_display = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"],
        )
        back_btn = gr.Button("Back to Input")
        back_to_start_btn2 = gr.Button("Back to Start")

    with gr.Column(visible=False) as chatbot_page:
        gr.Markdown("## Repo Recommendation Chatbot")
        chatbot = gr.Chatbot()
        state = gr.State([])
        user_input = gr.Textbox(label="Your message", placeholder="Describe your ideal repo or answer the assistant's questions...")
        send_btn = gr.Button("Send")
        end_btn = gr.Button("End Chat and Extract Keywords")
        keywords_output = gr.Textbox(label="Extracted Keywords for Repo Search", interactive=False)
        go_to_results_btn = gr.Button("Find Repos with These Keywords")
        back_to_start_btn3 = gr.Button("Back to Start")

    with gr.Column(visible=False) as results_page:
        gr.Markdown("## Repo Results Based on Your Conversation")
        results_df = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"],
        )
        analyze_next_btn = gr.Button("Download, Combine & Analyze Next Repo")
        combined_txt_results = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt_results = gr.Textbox(label="LLM Analysis Output", lines=10)
        back_to_start_btn4 = gr.Button("Back to Start")
|
|
    # Every navigation click updates all five pages through one shared outputs
    # list, matching the five-update tuples returned by the go_to_* helpers,
    # so no stale page can remain visible.
    all_pages = [start_page, input_page, analysis_page, chatbot_page, results_page]
    option_a_btn.click(go_to_input, inputs=None, outputs=all_pages)
    option_b_btn.click(go_to_chatbot, inputs=None, outputs=all_pages)
    next_btn.click(go_to_analysis, inputs=None, outputs=all_pages)
    back_btn.click(go_to_input, inputs=None, outputs=all_pages)
    back_to_start_btn.click(go_to_start, inputs=None, outputs=all_pages)
    back_to_start_btn2.click(go_to_start, inputs=None, outputs=all_pages)
    back_to_start_btn3.click(go_to_start, inputs=None, outputs=all_pages)
    back_to_start_btn4.click(go_to_start, inputs=None, outputs=all_pages)
|
|
    keyword_btn.click(keyword_search_and_update, inputs=keyword_input, outputs=df_box)
    submit_btn.click(process_repo_input_and_store, inputs=repo_id_box, outputs=df_box)

    combine_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt, llm_output_txt, df_display])
|
|
    def user_send(user_message, history):
        """Append one user/assistant exchange to the chat and clear the textbox."""
        assistant_reply = chat_with_user(user_message, history)
        history = history + [[user_message, assistant_reply]]
        return history, history, ""

    def end_chat(history):
        """Extract search keywords from the conversation and cache them."""
        global generated_keywords
        keywords = extract_keywords_from_conversation(history)
        generated_keywords.clear()
        generated_keywords.extend([k.strip() for k in keywords.split(",") if k.strip()])
        return keywords

    def go_to_results_from_chatbot(keywords):
        """Search with the extracted keywords, then switch to the results page."""
        df = use_keywords_to_search_and_update_csv(keywords)
        return (*go_to_results(), df)
|
|
    send_btn.click(user_send, inputs=[user_input, state], outputs=[chatbot, state, user_input])
    end_btn.click(end_chat, inputs=state, outputs=keywords_output)
    go_to_results_btn.click(
        go_to_results_from_chatbot,
        inputs=keywords_output,
        outputs=all_pages + [results_df],
    )

    analyze_next_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt_results, llm_output_txt_results, results_df])
|
|
if __name__ == "__main__":
    demo.launch()