naman1102 committed on
Commit 9d332ff · 1 Parent(s): adcb6a8
Files changed (3)
  1. app.py +10 -1
  2. app_old.py +0 -481
  3. hf_utils.py +4 -0
app.py CHANGED
@@ -977,6 +977,15 @@ def create_ui() -> gr.Blocks:
             # Get final updated dataframe
             updated_df = read_csv_to_dataframe()
 
+            # Filter out rows with no analysis data for consistent display with top 3
+            analyzed_df = updated_df.copy()
+            analyzed_df = analyzed_df[
+                (analyzed_df['strength'].str.strip() != '') |
+                (analyzed_df['weaknesses'].str.strip() != '') |
+                (analyzed_df['speciality'].str.strip() != '') |
+                (analyzed_df['relevance rating'].str.strip() != '')
+            ]
+
             # Get top 3 most relevant repositories using full data
             top_repos = get_top_relevant_repos(updated_df, user_requirements, top_n=3)
 
@@ -993,7 +1002,7 @@ def create_ui() -> gr.Blocks:
             show_top_section = gr.update(visible=not top_repos.empty)
 
             logger.info(f"Batch analysis completed: {successful_analyses} successful, {failed_analyses} failed, {csv_update_failures} CSV update issues")
-            return format_dataframe_for_display(updated_df), final_status, format_dataframe_for_display(top_repos), show_top_section
+            return format_dataframe_for_display(analyzed_df), final_status, format_dataframe_for_display(top_repos), show_top_section
 
         except Exception as e:
             logger.error(f"Error in batch analysis: {e}")
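Note: the new filter hides rows whose four analysis columns are all blank from the results table, while the unfiltered updated_df still feeds get_top_relevant_repos. Below is a minimal, self-contained sketch of that behavior — the sample frame is invented, and the fillna("") guard is an extra precaution added here (a bare .str.strip() comparison would propagate NaN if the CSV reader ever returned missing values instead of ""):

import pandas as pd

# Illustrative data, not the Space's CSV
df = pd.DataFrame({
    "repo id": ["a/space-1", "b/space-2", "c/space-3"],
    "strength": ["fast startup", "", ""],
    "weaknesses": ["", "", ""],
    "speciality": ["", "audio demos", ""],
    "relevance rating": ["", "", ""],
})

# Keep a row if at least one analysis column is non-blank after stripping whitespace
analysis_cols = ["strength", "weaknesses", "speciality", "relevance rating"]
mask = pd.Series(False, index=df.index)
for col in analysis_cols:
    mask |= df[col].fillna("").str.strip() != ""

analyzed_df = df[mask]
print(analyzed_df["repo id"].tolist())  # ['a/space-1', 'b/space-2']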
app_old.py DELETED
@@ -1,481 +0,0 @@
- import gradio as gr
- import regex as re
- import csv
- import pandas as pd
- from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response
- from hf_utils import download_filtered_space_files, search_top_spaces
- from chatbot_page import chat_with_user, extract_keywords_from_conversation
- # Import chatbot logic
- from analyzer import analyze_code
-
- # Chatbot system prompt
- CHATBOT_SYSTEM_PROMPT = (
-     "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
-     "Ask questions to clarify what they want, their use case, preferred language, features, etc. "
-     "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
-     "Return only the keywords as a comma-separated list."
- )
-
- # Initial assistant message for chatbot
- CHATBOT_INITIAL_MESSAGE = "Hello! Please tell me about your ideal Hugging Face repo. What use case, preferred language, or features are you looking for?"
-
- def read_csv_as_text(csv_filename):
-     return pd.read_csv(csv_filename, dtype=str)
-
- def process_repo_input(text):
-     if not text:
-         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-     # Split by newlines and commas, strip whitespace
-     repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
-     # Write to CSV
-     csv_filename = "repo_ids.csv"
-     with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
-         writer = csv.writer(csvfile)
-         writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-         for repo_id in repo_ids:
-             writer.writerow([repo_id, "", "", "", ""])
-     # Read the CSV into a DataFrame to display
-     df = read_csv_as_text(csv_filename)
-     return df
-
- # Store the last entered repo ids and the current index in global variables for button access
- last_repo_ids = []
- current_repo_idx = 0
-
- # Store extracted keywords for the chatbot flow
- generated_keywords = []
-
- def process_repo_input_and_store(text):
-     global last_repo_ids, current_repo_idx
-     if not text:
-         last_repo_ids = []
-         current_repo_idx = 0
-         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-     repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
-     last_repo_ids = repo_ids
-     current_repo_idx = 0
-     csv_filename = "repo_ids.csv"
-     with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
-         writer = csv.writer(csvfile)
-         writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-         for repo_id in repo_ids:
-             writer.writerow([repo_id, "", "", "", ""])
-     df = read_csv_as_text(csv_filename)
-     return df
-
- def keyword_search_and_update(keyword):
-     global last_repo_ids, current_repo_idx
-     if not keyword:
-         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-     # Accept multiple keywords, comma or newline separated
-     keyword_list = [k.strip() for k in re.split(r'[\n,]+', keyword) if k.strip()]
-     repo_ids = []
-     for kw in keyword_list:
-         repo_ids.extend(search_top_spaces(kw, limit=5))
-     # Remove duplicates while preserving order
-     seen = set()
-     unique_repo_ids = []
-     for rid in repo_ids:
-         if rid not in seen:
-             unique_repo_ids.append(rid)
-             seen.add(rid)
-     last_repo_ids = unique_repo_ids
-     current_repo_idx = 0
-     csv_filename = "repo_ids.csv"
-     with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
-         writer = csv.writer(csvfile)
-         writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-         for repo_id in unique_repo_ids:
-             writer.writerow([repo_id, "", "", "", ""])
-     df = read_csv_as_text(csv_filename)
-     return df
-
- def show_combined_repo_and_llm():
-     global current_repo_idx
-     if not last_repo_ids:
-         return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
-     if current_repo_idx >= len(last_repo_ids):
-         return "All repo IDs have been processed.", "", read_csv_as_text("repo_ids.csv")
-     repo_id = last_repo_ids[current_repo_idx]
-     try:
-         download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=[".py", ".md", ".txt"])
-     except Exception as e:
-         return f"Error downloading repo: {e}", "", read_csv_as_text("repo_ids.csv")
-     txt_path = combine_repo_files_for_llm()
-     try:
-         with open(txt_path, "r", encoding="utf-8") as f:
-             combined_content = f.read()
-     except Exception as e:
-         return f"Error reading {txt_path}: {e}", "", read_csv_as_text("repo_ids.csv")
-     llm_output = analyze_combined_file(txt_path)
-     # Extract only the last JSON object (final summary) for CSV writing
-     last_start = llm_output.rfind('{')
-     last_end = llm_output.rfind('}')
-     if last_start != -1 and last_end != -1 and last_end > last_start:
-         final_json_str = llm_output[last_start:last_end+1]
-     else:
-         final_json_str = llm_output
-     llm_json = parse_llm_json_response(final_json_str)
-     # Update CSV for the current repo id
-     csv_filename = "repo_ids.csv"
-     extraction_status = ""
-     strengths = ""
-     weaknesses = ""
-     try:
-         df = read_csv_as_text(csv_filename)
-         for col in ["strength", "weaknesses", "speciality", "relevance rating"]:
-             df[col] = df[col].astype(str)
-         updated = False
-         for idx, row in df.iterrows():
-             if row["repo id"] == repo_id:
-                 if isinstance(llm_json, dict) and "error" not in llm_json:
-                     extraction_status = "JSON extraction: SUCCESS"
-                     strengths = llm_json.get("strength", "")
-                     weaknesses = llm_json.get("weaknesses", "")
-                     df.at[idx, "strength"] = strengths
-                     df.at[idx, "weaknesses"] = weaknesses
-                     df.at[idx, "speciality"] = llm_json.get("speciality", "")
-                     df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
-                     updated = True
-                 else:
-                     extraction_status = f"JSON extraction: FAILED\nRaw: {llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json}"
-                 break
-         # If not updated (repo_id not found), append a new row
-         if not updated and isinstance(llm_json, dict) and "error" not in llm_json:
-             extraction_status = "JSON extraction: SUCCESS (new row)"
-             strengths = llm_json.get("strength", "")
-             weaknesses = llm_json.get("weaknesses", "")
-             new_row = {
-                 "repo id": repo_id,
-                 "strength": strengths,
-                 "weaknesses": weaknesses,
-                 "speciality": llm_json.get("speciality", ""),
-                 "relevance rating": llm_json.get("relevance rating", "")
-             }
-             df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
-         df.to_csv(csv_filename, index=False)
-     except Exception as e:
-         df = read_csv_as_text(csv_filename)
-         extraction_status = f"CSV update error: {e}"
-     # Move to next repo for next click
-     current_repo_idx += 1
-     summary = f"{extraction_status}\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
-     return combined_content, summary, df
-
- def go_to_analysis():
-     return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
-
- def go_to_input():
-     return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
-
- def go_to_chatbot():
-     return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
-
- def go_to_start():
-     return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
-
- def go_to_results():
-     return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
-
- repo_id_input = gr.Textbox(label="Enter repo IDs (comma or newline separated)", lines=5, placeholder="repo1, repo2\nrepo3")
- df_output = gr.Dataframe(headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
-                          datatype=["str", "str", "str", "str", "str", "str"]
- )
-
-
-
-
- def use_keywords_to_search_and_update_csv(keywords):
-     global last_repo_ids, current_repo_idx
-     if not keywords:
-         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-     # Split keywords and search for each
-     keyword_list = [k.strip() for k in keywords.split(",") if k.strip()]
-     repo_ids = []
-     for kw in keyword_list:
-         repo_ids.extend(search_top_spaces(kw, limit=3))  # limit=3 per keyword
-     # Remove duplicates while preserving order
-     seen = set()
-     unique_repo_ids = []
-     for rid in repo_ids:
-         if rid not in seen:
-             unique_repo_ids.append(rid)
-             seen.add(rid)
-     last_repo_ids = unique_repo_ids
-     current_repo_idx = 0
-     csv_filename = "repo_ids.csv"
-     with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
-         writer = csv.writer(csvfile)
-         writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
-         for repo_id in unique_repo_ids:
-             writer.writerow([repo_id, "", "", "", ""])
-     df = read_csv_as_text(csv_filename)
-     return df
-
- def batch_analyze_and_select_top():
-     csv_filename = "repo_ids.csv"
-     try:
-         df = read_csv_as_text(csv_filename)
-         all_infos = []
-         # Analyze each repo and update CSV
-         for idx, row in df.iterrows():
-             repo_id = row["repo id"]
-             try:
-                 download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=[".py", ".md", ".txt"])
-                 txt_path = combine_repo_files_for_llm()
-                 llm_output = analyze_combined_file(txt_path)
-                 last_start = llm_output.rfind('{')
-                 last_end = llm_output.rfind('}')
-                 if last_start != -1 and last_end != -1 and last_end > last_start:
-                     final_json_str = llm_output[last_start:last_end+1]
-                 else:
-                     final_json_str = llm_output
-                 llm_json = parse_llm_json_response(final_json_str)
-                 if isinstance(llm_json, dict) and "error" not in llm_json:
-                     df.at[idx, "strength"] = llm_json.get("strength", "")
-                     df.at[idx, "weaknesses"] = llm_json.get("weaknesses", "")
-                     df.at[idx, "speciality"] = llm_json.get("speciality", "")
-                     df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
-                     all_infos.append({"repo id": repo_id, **llm_json})
-             except Exception as e:
-                 all_infos.append({"repo id": repo_id, "error": str(e)})
-         df.to_csv(csv_filename, index=False)
-         # Display all info
-         all_info_str = "\n\n".join([str(info) for info in all_infos])
-         # Let LLM choose the best 3
-         from openai import OpenAI
-         import os
-         client = OpenAI(api_key=os.getenv("modal_api"))
-         client.base_url = os.getenv("base_url")
-         selection_prompt = (
-             "You are a helpful assistant. You are given a list of repo analyses in JSON format. "
-             "Choose the 3 repos that are the most impressive, relevant, or useful. "
-             "Return ONLY a JSON array of the 3 best repo ids, in order of preference, under the key 'top_repos'. "
-             "Example: {\"top_repos\": [\"repo1\", \"repo2\", \"repo3\"]}"
-         )
-         user_content = "Here are the repo analyses:\n" + all_info_str
-         response = client.chat.completions.create(
-             model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
-             messages=[
-                 {"role": "system", "content": selection_prompt},
-                 {"role": "user", "content": user_content}
-             ],
-             max_tokens=256,
-             temperature=0.3
-         )
-         selection_json = parse_llm_json_response(response.choices[0].message.content)
-         top_repos = selection_json.get("top_repos", [])
-         return all_info_str, str(top_repos), df
-     except Exception as e:
-         return f"Error in batch analysis: {e}", "", pd.DataFrame()
-
- def batch_analyze_and_select_top_for_chat(state):
-     csv_filename = "repo_ids.csv"
-     try:
-         df = read_csv_as_text(csv_filename)
-         all_infos = []
-         for idx, row in df.iterrows():
-             repo_id = row["repo id"]
-             try:
-                 download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=[".py", ".md", ".txt"])
-                 txt_path = combine_repo_files_for_llm()
-                 llm_output = analyze_combined_file(txt_path)
-                 last_start = llm_output.rfind('{')
-                 last_end = llm_output.rfind('}')
-                 if last_start != -1 and last_end != -1 and last_end > last_start:
-                     final_json_str = llm_output[last_start:last_end+1]
-                 else:
-                     final_json_str = llm_output
-                 llm_json = parse_llm_json_response(final_json_str)
-                 if isinstance(llm_json, dict) and "error" not in llm_json:
-                     df.at[idx, "strength"] = llm_json.get("strength", "")
-                     df.at[idx, "weaknesses"] = llm_json.get("weaknesses", "")
-                     df.at[idx, "speciality"] = llm_json.get("speciality", "")
-                     df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
-                     all_infos.append({"repo id": repo_id, **llm_json})
-             except Exception as e:
-                 all_infos.append({"repo id": repo_id, "error": str(e)})
-         df.to_csv(csv_filename, index=False)
-         all_info_str = "\n\n".join([str(info) for info in all_infos])
-         from openai import OpenAI
-         import os
-         client = OpenAI(api_key=os.getenv("modal_api"))
-         client.base_url = os.getenv("base_url")
-         selection_prompt = (
-             "You are a helpful assistant. You are given a list of repo analyses in JSON format. "
-             "Choose the 3 repos that are the most impressive, relevant, or useful. "
-             "Return ONLY a JSON array of the 3 best repo ids, in order of preference, under the key 'top_repos'. "
-             "Example: {\"top_repos\": [\"repo1\", \"repo2\", \"repo3\"]}"
-         )
-         user_content = "Here are the repo analyses:\n" + all_info_str
-         response = client.chat.completions.create(
-             model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
-             messages=[
-                 {"role": "system", "content": selection_prompt},
-                 {"role": "user", "content": user_content}
-             ],
-             max_tokens=256,
-             temperature=0.3
-         )
-         selection_json = parse_llm_json_response(response.choices[0].message.content)
-         top_repos = selection_json.get("top_repos", [])
-         # Add a new assistant message to the chat state
-         new_message = ("", f"The top 3 repo IDs are: {', '.join(top_repos)}")
-         if state is None:
-             state = []
-         state = state + [list(new_message)]
-         return state
-     except Exception as e:
-         new_message = ("", f"Error in batch analysis: {e}")
-         if state is None:
-             state = []
-         state = state + [list(new_message)]
-         return state
-
- with gr.Blocks() as demo:
-     page_state = gr.State(0)
-
-     # --- Start Page: Option Selection ---
-     with gr.Column(visible=True) as start_page:
-         gr.Markdown("## Welcome! How would you like to proceed?")
-         option_a_btn = gr.Button("A) I know which repos I want to search and research about")
-         option_b_btn = gr.Button("B) I don't know exactly what I want (Chatbot)")
-
-     # --- Page 1: Input ---
-     with gr.Column(visible=False) as input_page:
-         gr.Markdown("## Enter Keyword or Repo IDs")
-         keyword_input = gr.Textbox(label="Enter keywords to search repos (comma or newline separated)", lines=2, placeholder="e.g. audio, vision\ntext")
-         keyword_btn = gr.Button("Search and Update Repo List")
-         repo_id_box = repo_id_input.render()
-         df_box = df_output.render()
-         submit_btn = gr.Button("Submit Repo IDs")
-         next_btn = gr.Button("Next: Go to Analysis")
-         back_to_start_btn = gr.Button("Back to Start")
-
-     # --- Page 2: Analysis ---
-     with gr.Column(visible=False) as analysis_page:
-         gr.Markdown("## Combine and Display Repo Files")
-         combine_btn = gr.Button("Download, Combine & Show .py/.md Files from Next Repo and Analyze")
-         combined_txt = gr.Textbox(label="Combined Repo Files", lines=20)
-         llm_output_txt = gr.Textbox(label="LLM Analysis Output", lines=10)
-         df_display = gr.Dataframe(
-             headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
-             datatype=["str", "str", "str", "str", "str", "str"]
-         )
-         back_btn = gr.Button("Back to Input")
-         back_to_start_btn2 = gr.Button("Back to Start")
-
-     # --- Page 3: Chatbot ---
-     with gr.Column(visible=False) as chatbot_page:
-         gr.Markdown("## Repo Recommendation Chatbot")
-         chatbot = gr.Chatbot()
-         state = gr.State([])
-         user_input = gr.Textbox(label="Your message", placeholder="Describe your ideal repo or answer the assistant's questions...")
-         send_btn = gr.Button("Send")
-         end_btn = gr.Button("End Chat and Extract Keywords")
-         keywords_output = gr.Textbox(label="Extracted Keywords for Repo Search", interactive=False)
-         go_to_results_btn = gr.Button("Find Repos with These Keywords")
-         back_to_start_btn3 = gr.Button("Back to Start")
-
-     # --- Page 4: Results after Chatbot ---
-     with gr.Column(visible=False) as results_page:
-         gr.Markdown("## Repo Results Based on Your Conversation")
-         results_df = gr.Dataframe(
-             headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
-             datatype=["str", "str", "str", "str", "str", "str"]
-         )
-         analyze_next_btn = gr.Button("Download, Combine & Analyze Next Repo")
-         combined_txt_results = gr.Textbox(label="Combined Repo Files", lines=20)
-         llm_output_txt_results = gr.Textbox(label="LLM Analysis Output", lines=10)
-         back_to_start_btn4 = gr.Button("Back to Start")
-         go_to_batch_btn = gr.Button("Go to Batch Analysis Page", visible=True)
-
-     # --- Page 5: Batch Analysis Page ---
-     with gr.Column(visible=False) as batch_page:
-         gr.Markdown("## Batch Analysis & Top 3 Selection")
-         batch_btn = gr.Button("Batch Analyze All & Select Top 3", visible=True)
-         batch_info_txt = gr.Textbox(label="All Repo Analyses", lines=10)
-         top3_txt = gr.Textbox(label="Top 3 Repo IDs", lines=1)
-         show_top3_chat_btn = gr.Button("Show Top 3 Repo IDs in Chat", visible=True)
-         show_top3_page_btn = gr.Button("Show Top 3 Repos on New Page", visible=True)
-         back_to_results_from_batch_btn = gr.Button("Back to Results")
-
-     # --- Page 6: Top 3 Repos Page ---
-     with gr.Column(visible=False) as top3_page:
-         gr.Markdown("## Top 3 Recommended Repos")
-         top3_df = gr.Dataframe(headers=["repo id"], datatype=["str"])
-         back_to_results_btn = gr.Button("Back to Results")
-
-     # Navigation logic
-     option_a_btn.click(go_to_input, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-     option_b_btn.click(
-         lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), [["", CHATBOT_INITIAL_MESSAGE]]),
-         inputs=None,
-         outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page, state]
-     )
-     next_btn.click(go_to_analysis, inputs=None, outputs=[input_page, analysis_page, chatbot_page, results_page, batch_page, top3_page])
-     back_btn.click(go_to_input, inputs=None, outputs=[input_page, analysis_page, chatbot_page, results_page, batch_page, top3_page])
-     back_to_start_btn.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-     back_to_start_btn2.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-     back_to_start_btn3.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-     back_to_start_btn4.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-     go_to_batch_btn.click(lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)), inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-     back_to_results_from_batch_btn.click(lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)), inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-     back_to_results_btn.click(lambda: (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)), inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page])
-
-     # Keyword and repo input logic
-     keyword_btn.click(keyword_search_and_update, inputs=keyword_input, outputs=df_box)
-     submit_btn.click(process_repo_input_and_store, inputs=repo_id_box, outputs=df_box)
-
-     # Analysis logic
-     combine_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt, llm_output_txt, df_display])
-
-     # Chatbot logic
-     def user_send(user_message, history):
-         assistant_reply = chat_with_user(user_message, history)
-         history = history + [[user_message, assistant_reply]]
-         return history, history, ""
-
-     def end_chat(history):
-         keywords = extract_keywords_from_conversation(history)
-         global generated_keywords
-         generated_keywords.clear()
-         generated_keywords.extend([k.strip() for k in keywords.split(",") if k.strip()])
-         return keywords
-
-     def go_to_results_from_chatbot(keywords):
-         # Use the keywords to search and update the CSV, then display the DataFrame
-         df = use_keywords_to_search_and_update_csv(keywords)
-         return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), df
-
-     send_btn.click(user_send, inputs=[user_input, state], outputs=[chatbot, state, user_input])
-     end_btn.click(end_chat, inputs=state, outputs=keywords_output)
-     go_to_results_btn.click(
-         go_to_results_from_chatbot,
-         inputs=keywords_output,
-         outputs=[chatbot_page, input_page, analysis_page, results_page, batch_page, top3_page, results_df]
-     )
-
-     # Add logic for the new button on results_page
-     analyze_next_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt_results, llm_output_txt_results, results_df])
-     batch_btn.click(batch_analyze_and_select_top, inputs=None, outputs=[batch_info_txt, top3_txt, df_output])
-     show_top3_chat_btn.click(batch_analyze_and_select_top_for_chat, inputs=[state], outputs=[state])
-
-     def show_top3_page():
-         # Run batch analysis, get top 3, save to CSV, and return DataFrame
-         all_info_str, top3_str, df = batch_analyze_and_select_top()
-         import pandas as pd
-         import ast
-         try:
-             top3_ids = ast.literal_eval(top3_str)
-             if isinstance(top3_ids, str):
-                 top3_ids = [top3_ids]
-         except Exception:
-             top3_ids = []
-         top3_df_data = pd.DataFrame({"repo id": top3_ids})
-         top3_df_data.to_csv("top3_repos.csv", index=False)
-         return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), top3_df_data
-
-     show_top3_page_btn.click(show_top3_page, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page, batch_page, top3_page, top3_df])
-
- demo.launch()
hf_utils.py CHANGED
@@ -35,6 +35,10 @@ def download_filtered_space_files(space_id: str, local_dir: str = "repo_files",
         rel_path = os.path.relpath(src_file, repo_path)
         dest_file = os.path.join(local_dir, rel_path)
         os.makedirs(os.path.dirname(dest_file), exist_ok=True)
+
+        # Debug: Show exactly which file is being downloaded
+        print(f"DEBUG: Downloading file: {rel_path}")
+
         shutil.copy2(src_file, dest_file)
         copied_files += 1
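Note: a hedged sketch of the copy loop this hunk instruments. The names src_file, rel_path, dest_file, repo_path, local_dir, and copied_files come from the diff context; the wrapper function, its signature, and the dest_dir guard are assumptions made here for a runnable illustration, not the Space's actual implementation:

import os
import shutil

def copy_matched_files(matched_files: list[str], repo_path: str, local_dir: str) -> int:
    """Copy already-filtered files into local_dir, preserving relative paths."""
    copied_files = 0
    for src_file in matched_files:
        rel_path = os.path.relpath(src_file, repo_path)
        dest_file = os.path.join(local_dir, rel_path)
        dest_dir = os.path.dirname(dest_file)
        if dest_dir:  # guard added here: os.makedirs("") raises for top-level files
            os.makedirs(dest_dir, exist_ok=True)
        # Debug: show exactly which file is being downloaded
        print(f"DEBUG: Downloading file: {rel_path}")
        shutil.copy2(src_file, dest_file)
        copied_files += 1
    return copied_files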