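"""Gradio app for finding and evaluating Hugging Face Spaces repos.

Users either enter repo ids directly or describe what they want to a chatbot;
the app then downloads each repo's .py/.md/.txt files, has an LLM analyze
them, tracks results in repo_ids.csv, and can batch-rank the top 3 repos.
Depends on the local `analyzer`, `hf_utils`, and `chatbot_page` modules (not
shown here) and on the `modal_api` / `base_url` environment variables for an
OpenAI-compatible endpoint.
"""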
import csv
import os
import re

import gradio as gr
import pandas as pd
from openai import OpenAI

from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response
from chatbot_page import chat_with_user, extract_keywords_from_conversation
from hf_utils import download_filtered_space_files, search_top_spaces

# Chatbot system prompt
CHATBOT_SYSTEM_PROMPT = (
    "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
    "Ask questions to clarify what they want, their use case, preferred language, features, etc. "
    "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
    "Return only the keywords as a comma-separated list."
)

# Initial assistant message for chatbot
CHATBOT_INITIAL_MESSAGE = "Hello! Please tell me about your ideal Hugging Face repo. What use case, preferred language, or features are you looking for?"

def read_csv_as_text(csv_filename):
    # Read every column as str and normalize missing cells to "" (dtype=str
    # alone still yields NaN for empty cells, which would later render as "nan").
    return pd.read_csv(csv_filename, dtype=str).fillna("")
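
# Schema shared by repo_ids.csv and the DataFrames shown in the UI.
CSV_COLUMNS = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]

def write_repo_ids_csv(repo_ids, csv_filename="repo_ids.csv"):
    # (Re)create the analysis CSV with one blank analysis row per repo id.
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(CSV_COLUMNS)
        for repo_id in repo_ids:
            writer.writerow([repo_id, "", "", "", ""])

def extract_last_json_object(llm_output):
    # The analyzer may emit several JSON objects; only the last one (the
    # final summary) is written to the CSV. Fall back to the raw text when
    # no complete {...} block is present.
    last_start = llm_output.rfind('{')
    last_end = llm_output.rfind('}')
    if last_start != -1 and last_end != -1 and last_end > last_start:
        return llm_output[last_start:last_end + 1]
    return llm_output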

def process_repo_input(text):
    if not text:
        return pd.DataFrame(columns=CSV_COLUMNS)
    # Split by newlines and commas, strip whitespace
    repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
    write_repo_ids_csv(repo_ids)
    # Read the CSV back into a DataFrame to display
    return read_csv_as_text("repo_ids.csv")

# Store the last entered repo ids and the current index in global variables for button access
last_repo_ids = []
current_repo_idx = 0

# Store extracted keywords for the chatbot flow
generated_keywords = []

def process_repo_input_and_store(text):
    global last_repo_ids, current_repo_idx
    if not text:
        last_repo_ids = []
        current_repo_idx = 0
        return pd.DataFrame(columns=CSV_COLUMNS)
    repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
    last_repo_ids = repo_ids
    current_repo_idx = 0
    write_repo_ids_csv(repo_ids)
    return read_csv_as_text("repo_ids.csv")
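
# Keyword search: query search_top_spaces for each comma/newline-separated
# keyword, de-duplicate hits while preserving order, and rebuild the CSV.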
def keyword_search_and_update(keyword):
    global last_repo_ids, current_repo_idx
    if not keyword:
        return pd.DataFrame(columns=CSV_COLUMNS)
    # Accept multiple keywords, comma or newline separated
    keyword_list = [k.strip() for k in re.split(r'[\n,]+', keyword) if k.strip()]
    repo_ids = []
    for kw in keyword_list:
        repo_ids.extend(search_top_spaces(kw, limit=5))
    # Remove duplicates while preserving order
    unique_repo_ids = list(dict.fromkeys(repo_ids))
    last_repo_ids = unique_repo_ids
    current_repo_idx = 0
    write_repo_ids_csv(unique_repo_ids)
    return read_csv_as_text("repo_ids.csv")
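
# One repo per click: each call downloads, combines, and analyzes
# last_repo_ids[current_repo_idx], writes the result into repo_ids.csv,
# then advances the index so the next click processes the next repo.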
def show_combined_repo_and_llm():
    global current_repo_idx
    if not last_repo_ids:
        return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
    if current_repo_idx >= len(last_repo_ids):
        return "All repo IDs have been processed.", "", read_csv_as_text("repo_ids.csv")
    repo_id = last_repo_ids[current_repo_idx]
    try:
        download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=[".py", ".md", ".txt"])
    except Exception as e:
        return f"Error downloading repo: {e}", "", read_csv_as_text("repo_ids.csv")
    txt_path = combine_repo_files_for_llm()
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            combined_content = f.read()
    except Exception as e:
        return f"Error reading {txt_path}: {e}", "", read_csv_as_text("repo_ids.csv")
    llm_output = analyze_combined_file(txt_path)
    # Extract only the last JSON object (final summary) for CSV writing
    llm_json = parse_llm_json_response(extract_last_json_object(llm_output))
    # Update CSV for the current repo id
    csv_filename = "repo_ids.csv"
    extraction_status = ""
    strengths = ""
    weaknesses = ""
    try:
        df = read_csv_as_text(csv_filename)
        updated = False
        for idx, row in df.iterrows():
            if row["repo id"] == repo_id:
                if isinstance(llm_json, dict) and "error" not in llm_json:
                    extraction_status = "JSON extraction: SUCCESS"
                    strengths = llm_json.get("strength", "")
                    weaknesses = llm_json.get("weaknesses", "")
                    df.at[idx, "strength"] = strengths
                    df.at[idx, "weaknesses"] = weaknesses
                    df.at[idx, "speciality"] = llm_json.get("speciality", "")
                    df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
                    updated = True
                else:
                    extraction_status = f"JSON extraction: FAILED\nRaw: {llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json}"
                break
        # If the repo id was not found in the CSV, append a new row
        if not updated and isinstance(llm_json, dict) and "error" not in llm_json:
            extraction_status = "JSON extraction: SUCCESS (new row)"
            strengths = llm_json.get("strength", "")
            weaknesses = llm_json.get("weaknesses", "")
            new_row = {
                "repo id": repo_id,
                "strength": strengths,
                "weaknesses": weaknesses,
                "speciality": llm_json.get("speciality", ""),
                "relevance rating": llm_json.get("relevance rating", "")
            }
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
        df.to_csv(csv_filename, index=False)
    except Exception as e:
        df = read_csv_as_text(csv_filename)
        extraction_status = f"CSV update error: {e}"
    # Advance to the next repo for the next click
    current_repo_idx += 1
    summary = f"{extraction_status}\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
    return combined_content, summary, df
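
# Page navigation. The layout below defines seven page containers, toggled in
# this fixed order: start, input, analysis, chatbot, results, batch, top3.
# Each helper returns one visibility update per page, so every navigation
# button can share the single ALL_PAGES outputs list defined after the layout.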
def show_only(page_idx):
    # One visibility update per page, showing only the page at page_idx.
    return tuple(gr.update(visible=(i == page_idx)) for i in range(7))

def go_to_start():
    return show_only(0)

def go_to_input():
    return show_only(1)

def go_to_analysis():
    return show_only(2)

def go_to_chatbot():
    return show_only(3)

def go_to_results():
    return show_only(4)
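
# These two input-page components are created unrendered at module level and
# placed into the layout later via .render(); df_output is also reused as an
# output by the batch-analysis button.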
repo_id_input = gr.Textbox(label="Enter repo IDs (comma or newline separated)", lines=5, placeholder="repo1, repo2\nrepo3")
df_output = gr.Dataframe(
    headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
    datatype=["str", "str", "str", "str", "str", "str"]
)

def use_keywords_to_search_and_update_csv(keywords):
    global last_repo_ids, current_repo_idx
    if not keywords:
        return pd.DataFrame(columns=CSV_COLUMNS)
    # Split keywords and search for each
    keyword_list = [k.strip() for k in keywords.split(",") if k.strip()]
    repo_ids = []
    for kw in keyword_list:
        repo_ids.extend(search_top_spaces(kw, limit=3))  # limit=3 per keyword
    # Remove duplicates while preserving order
    unique_repo_ids = list(dict.fromkeys(repo_ids))
    last_repo_ids = unique_repo_ids
    current_repo_idx = 0
    write_repo_ids_csv(unique_repo_ids)
    return read_csv_as_text("repo_ids.csv")
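
# Batch analysis: run the download/combine/analyze loop over every repo in
# repo_ids.csv, then ask the LLM to pick the three best repos overall.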
def run_batch_analysis():
    # Analyze each repo, update the CSV, and return
    # (all analyses as text, list of top repo ids, updated DataFrame).
    csv_filename = "repo_ids.csv"
    df = read_csv_as_text(csv_filename)
    all_infos = []
    for idx, row in df.iterrows():
        repo_id = row["repo id"]
        try:
            download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=[".py", ".md", ".txt"])
            txt_path = combine_repo_files_for_llm()
            llm_output = analyze_combined_file(txt_path)
            llm_json = parse_llm_json_response(extract_last_json_object(llm_output))
            if isinstance(llm_json, dict) and "error" not in llm_json:
                for col in ("strength", "weaknesses", "speciality", "relevance rating"):
                    df.at[idx, col] = llm_json.get(col, "")
                all_infos.append({"repo id": repo_id, **llm_json})
        except Exception as e:
            all_infos.append({"repo id": repo_id, "error": str(e)})
    df.to_csv(csv_filename, index=False)
    all_info_str = "\n\n".join(str(info) for info in all_infos)
    # Let the LLM choose the best 3
    client = OpenAI(api_key=os.getenv("modal_api"), base_url=os.getenv("base_url"))
    selection_prompt = (
        "You are a helpful assistant. You are given a list of repo analyses in JSON format. "
        "Choose the 3 repos that are the most impressive, relevant, or useful. "
        "Return ONLY a JSON array of the 3 best repo ids, in order of preference, under the key 'top_repos'. "
        "Example: {\"top_repos\": [\"repo1\", \"repo2\", \"repo3\"]}"
    )
    response = client.chat.completions.create(
        model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
        messages=[
            {"role": "system", "content": selection_prompt},
            {"role": "user", "content": "Here are the repo analyses:\n" + all_info_str}
        ],
        max_tokens=256,
        temperature=0.3
    )
    selection_json = parse_llm_json_response(response.choices[0].message.content)
    return all_info_str, selection_json.get("top_repos", []), df

def batch_analyze_and_select_top():
    try:
        all_info_str, top_repos, df = run_batch_analysis()
        return all_info_str, str(top_repos), df
    except Exception as e:
        return f"Error in batch analysis: {e}", "", pd.DataFrame()

def batch_analyze_and_select_top_for_chat(state):
    # Same pipeline as batch_analyze_and_select_top, but the result is
    # appended to the chat history as an assistant message.
    if state is None:
        state = []
    try:
        _, top_repos, _ = run_batch_analysis()
        message = f"The top 3 repo IDs are: {', '.join(top_repos)}"
    except Exception as e:
        message = f"Error in batch analysis: {e}"
    return state + [["", message]]
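
# --- UI layout ---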
with gr.Blocks() as demo:
    page_state = gr.State(0)

    # --- Start Page: Option Selection ---
    with gr.Column(visible=True) as start_page:
        gr.Markdown("## Welcome! How would you like to proceed?")
        option_a_btn = gr.Button("A) I know which repos I want to search and research about")
        option_b_btn = gr.Button("B) I don't know exactly what I want (Chatbot)")

    # --- Page 1: Input ---
    with gr.Column(visible=False) as input_page:
        gr.Markdown("## Enter Keyword or Repo IDs")
        keyword_input = gr.Textbox(label="Enter keywords to search repos (comma or newline separated)", lines=2, placeholder="e.g. audio, vision\ntext")
        keyword_btn = gr.Button("Search and Update Repo List")
        repo_id_box = repo_id_input.render()
        df_box = df_output.render()
        submit_btn = gr.Button("Submit Repo IDs")
        next_btn = gr.Button("Next: Go to Analysis")
        back_to_start_btn = gr.Button("Back to Start")

    # --- Page 2: Analysis ---
    with gr.Column(visible=False) as analysis_page:
        gr.Markdown("## Combine and Display Repo Files")
        combine_btn = gr.Button("Download, Combine & Show .py/.md Files from Next Repo and Analyze")
        combined_txt = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt = gr.Textbox(label="LLM Analysis Output", lines=10)
        df_display = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"]
        )
        back_btn = gr.Button("Back to Input")
        back_to_start_btn2 = gr.Button("Back to Start")

    # --- Page 3: Chatbot ---
    with gr.Column(visible=False) as chatbot_page:
        gr.Markdown("## Repo Recommendation Chatbot")
        chatbot = gr.Chatbot()
        state = gr.State([])
        user_input = gr.Textbox(label="Your message", placeholder="Describe your ideal repo or answer the assistant's questions...")
        send_btn = gr.Button("Send")
        end_btn = gr.Button("End Chat and Extract Keywords")
        keywords_output = gr.Textbox(label="Extracted Keywords for Repo Search", interactive=False)
        go_to_results_btn = gr.Button("Find Repos with These Keywords")
        back_to_start_btn3 = gr.Button("Back to Start")

    # --- Page 4: Results after Chatbot ---
    with gr.Column(visible=False) as results_page:
        gr.Markdown("## Repo Results Based on Your Conversation")
        results_df = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"]
        )
        analyze_next_btn = gr.Button("Download, Combine & Analyze Next Repo")
        combined_txt_results = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt_results = gr.Textbox(label="LLM Analysis Output", lines=10)
        back_to_start_btn4 = gr.Button("Back to Start")
        go_to_batch_btn = gr.Button("Go to Batch Analysis Page", visible=True)

    # --- Page 5: Batch Analysis Page ---
    with gr.Column(visible=False) as batch_page:
        gr.Markdown("## Batch Analysis & Top 3 Selection")
        batch_btn = gr.Button("Batch Analyze All & Select Top 3", visible=True)
        batch_info_txt = gr.Textbox(label="All Repo Analyses", lines=10)
        top3_txt = gr.Textbox(label="Top 3 Repo IDs", lines=1)
        show_top3_chat_btn = gr.Button("Show Top 3 Repo IDs in Chat", visible=True)
        show_top3_page_btn = gr.Button("Show Top 3 Repos on New Page", visible=True)
        back_to_results_from_batch_btn = gr.Button("Back to Results")

    # --- Page 6: Top 3 Repos Page ---
    with gr.Column(visible=False) as top3_page:
        gr.Markdown("## Top 3 Recommended Repos")
        top3_df = gr.Dataframe(headers=["repo id"], datatype=["str"])
        back_to_results_btn = gr.Button("Back to Results")

    # Navigation logic: every handler toggles all seven pages via ALL_PAGES,
    # in the same order as show_only().
    ALL_PAGES = [start_page, input_page, analysis_page, chatbot_page, results_page, batch_page, top3_page]
    option_a_btn.click(go_to_input, inputs=None, outputs=ALL_PAGES)
    # Option B opens the chatbot and seeds both the visible Chatbot component
    # and the state with the assistant's greeting.
    option_b_btn.click(
        lambda: (*go_to_chatbot(), [["", CHATBOT_INITIAL_MESSAGE]], [["", CHATBOT_INITIAL_MESSAGE]]),
        inputs=None,
        outputs=ALL_PAGES + [chatbot, state]
    )
    next_btn.click(go_to_analysis, inputs=None, outputs=ALL_PAGES)
    back_btn.click(go_to_input, inputs=None, outputs=ALL_PAGES)
    back_to_start_btn.click(go_to_start, inputs=None, outputs=ALL_PAGES)
    back_to_start_btn2.click(go_to_start, inputs=None, outputs=ALL_PAGES)
    back_to_start_btn3.click(go_to_start, inputs=None, outputs=ALL_PAGES)
    back_to_start_btn4.click(go_to_start, inputs=None, outputs=ALL_PAGES)
    go_to_batch_btn.click(lambda: show_only(5), inputs=None, outputs=ALL_PAGES)
    back_to_results_from_batch_btn.click(go_to_results, inputs=None, outputs=ALL_PAGES)
    back_to_results_btn.click(go_to_results, inputs=None, outputs=ALL_PAGES)

    # Keyword and repo input logic
    keyword_btn.click(keyword_search_and_update, inputs=keyword_input, outputs=df_box)
    submit_btn.click(process_repo_input_and_store, inputs=repo_id_box, outputs=df_box)

    # Analysis logic
    combine_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt, llm_output_txt, df_display])

    # Chatbot logic
    def user_send(user_message, history):
        assistant_reply = chat_with_user(user_message, history)
        history = history + [[user_message, assistant_reply]]
        return history, history, ""

    def end_chat(history):
        global generated_keywords
        keywords = extract_keywords_from_conversation(history)
        generated_keywords.clear()
        generated_keywords.extend([k.strip() for k in keywords.split(",") if k.strip()])
        return keywords

    def go_to_results_from_chatbot(keywords):
        # Use the keywords to search and update the CSV, then show the results page
        df = use_keywords_to_search_and_update_csv(keywords)
        return (*go_to_results(), df)

    send_btn.click(user_send, inputs=[user_input, state], outputs=[chatbot, state, user_input])
    end_btn.click(end_chat, inputs=state, outputs=keywords_output)
    go_to_results_btn.click(
        go_to_results_from_chatbot,
        inputs=keywords_output,
        outputs=ALL_PAGES + [results_df]
    )

    # Results-page and batch-page actions
    analyze_next_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt_results, llm_output_txt_results, results_df])
    batch_btn.click(batch_analyze_and_select_top, inputs=None, outputs=[batch_info_txt, top3_txt, df_output])
    show_top3_chat_btn.click(batch_analyze_and_select_top_for_chat, inputs=[state], outputs=[state])

    def show_top3_page():
        # Run the batch analysis, persist the top 3 repo ids, and open the Top 3 page
        try:
            _, top3_ids, _ = run_batch_analysis()
        except Exception:
            top3_ids = []
        top3_df_data = pd.DataFrame({"repo id": top3_ids})
        top3_df_data.to_csv("top3_repos.csv", index=False)
        return (*show_only(6), top3_df_data)

    show_top3_page_btn.click(show_top3_page, inputs=None, outputs=ALL_PAGES + [top3_df])

demo.launch()