naman1102 committed on
Commit
3330689
·
1 Parent(s): 6a5d12d
Files changed (2) hide show
  1. analyzer.py +27 -8
  2. app.py +53 -15
analyzer.py CHANGED
@@ -14,10 +14,11 @@ def analyze_code(code: str) -> str:
14
  system_prompt = (
15
  "You are a highly precise and strict JSON generator. Analyze the code given to you. "
16
  "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
 
17
  "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
18
  "If you cannot answer, still return a valid JSON with empty strings for each key. "
19
  "Example of the ONLY valid output:\n"
20
- "{\n 'strength': '...', \n 'weaknesses': '...', \n 'speciality': '...', \n 'relevance rating': '...'\n}"
21
  )
22
  response = client.chat.completions.create(
23
  model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ", # Updated model
@@ -108,22 +109,31 @@ def combine_repo_files_for_llm(repo_dir="repo_files", output_file="combined_repo
108
  out_f.write("\n".join(combined_content))
109
  return output_file
110
 
111
- def analyze_code_chunk(code: str) -> str:
112
  """
113
  Analyzes a code chunk and returns a JSON summary for that chunk.
114
  """
115
  from openai import OpenAI
116
  client = OpenAI(api_key=os.getenv("modal_api"))
117
  client.base_url = os.getenv("base_url")
 
 
 
 
 
 
118
  chunk_prompt = (
119
  "You are a highly precise and strict JSON generator. Analyze the following code chunk. "
120
  "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
121
  "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
 
122
  "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
123
  "If you cannot answer, still return a valid JSON with empty strings for each key. "
 
124
  "Example of the ONLY valid output:\n"
125
- '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "..."\n}'
126
  )
 
127
  response = client.chat.completions.create(
128
  model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
129
  messages=[
@@ -135,21 +145,29 @@ def analyze_code_chunk(code: str) -> str:
135
  )
136
  return response.choices[0].message.content
137
 
138
- def aggregate_chunk_analyses(chunk_jsons: list) -> str:
139
  """
140
  Aggregates a list of chunk JSONs into a single JSON summary using the LLM.
141
  """
142
  from openai import OpenAI
143
  client = OpenAI(api_key=os.getenv("modal_api"))
144
  client.base_url = os.getenv("base_url")
 
 
 
 
 
 
145
  aggregation_prompt = (
146
  "You are a highly precise and strict, code analyzer and JSON generator. You are given a list of JSON analyses of code chunks. "
147
  "Aggregate these into a SINGLE overall JSON summary with the same keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
148
  "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
 
149
  "Summarize and combine the information from all chunks. Do NOT include any explanation, markdown, or text outside the JSON. "
150
  "If a key is missing in all chunks, use an empty string. "
 
151
  "Example of the ONLY valid output:\n"
152
- '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "..."\n}'
153
  )
154
  user_content = "Here are the chunk analyses:\n" + "\n".join(chunk_jsons)
155
  response = client.chat.completions.create(
@@ -163,9 +181,10 @@ def aggregate_chunk_analyses(chunk_jsons: list) -> str:
163
  )
164
  return response.choices[0].message.content
165
 
166
- def analyze_combined_file(output_file="combined_repo.txt"):
167
  """
168
  Reads the combined file, splits it into 500-line chunks, analyzes each chunk, and aggregates the LLM's output into a final summary.
 
169
  Returns the chunk JSONs (for debugging) and the aggregated analysis as a string.
170
  """
171
  try:
@@ -175,9 +194,9 @@ def analyze_combined_file(output_file="combined_repo.txt"):
175
  chunk_jsons = []
176
  for i in range(0, len(lines), chunk_size):
177
  chunk = "".join(lines[i:i+chunk_size])
178
- analysis = analyze_code_chunk(chunk)
179
  chunk_jsons.append(analysis)
180
- final_summary = aggregate_chunk_analyses(chunk_jsons)
181
  debug_output = (
182
  "==== Chunk JSON Outputs ===="
183
  + "\n\n".join([f"Chunk {i+1} JSON:\n{chunk_jsons[i]}" for i in range(len(chunk_jsons))])
 
14
  system_prompt = (
15
  "You are a highly precise and strict JSON generator. Analyze the code given to you. "
16
  "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
17
+ "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
18
  "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
19
  "If you cannot answer, still return a valid JSON with empty strings for each key. "
20
  "Example of the ONLY valid output:\n"
21
+ "{\n 'strength': '...', \n 'weaknesses': '...', \n 'speciality': '...', \n 'relevance rating': 'high'\n}"
22
  )
23
  response = client.chat.completions.create(
24
  model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ", # Updated model
 
109
  out_f.write("\n".join(combined_content))
110
  return output_file
111
 
112
+ def analyze_code_chunk(code: str, user_requirements: str = "") -> str:
113
  """
114
  Analyzes a code chunk and returns a JSON summary for that chunk.
115
  """
116
  from openai import OpenAI
117
  client = OpenAI(api_key=os.getenv("modal_api"))
118
  client.base_url = os.getenv("base_url")
119
+
120
+ # Build the user requirements section
121
+ requirements_section = ""
122
+ if user_requirements.strip():
123
+ requirements_section = f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\nWhen rating relevance, consider how well this code matches the user's stated requirements."
124
+
125
  chunk_prompt = (
126
  "You are a highly precise and strict JSON generator. Analyze the following code chunk. "
127
  "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
128
  "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
129
+ "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
130
  "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
131
  "If you cannot answer, still return a valid JSON with empty strings for each key. "
132
+ f"{requirements_section}"
133
  "Example of the ONLY valid output:\n"
134
+ '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}'
135
  )
136
+
137
  response = client.chat.completions.create(
138
  model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
139
  messages=[
 
145
  )
146
  return response.choices[0].message.content
147
 
148
+ def aggregate_chunk_analyses(chunk_jsons: list, user_requirements: str = "") -> str:
149
  """
150
  Aggregates a list of chunk JSONs into a single JSON summary using the LLM.
151
  """
152
  from openai import OpenAI
153
  client = OpenAI(api_key=os.getenv("modal_api"))
154
  client.base_url = os.getenv("base_url")
155
+
156
+ # Build the user requirements section
157
+ requirements_section = ""
158
+ if user_requirements.strip():
159
+ requirements_section = f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\nWhen aggregating the relevance rating, consider how well the overall repository matches the user's stated requirements."
160
+
161
  aggregation_prompt = (
162
  "You are a highly precise and strict, code analyzer and JSON generator. You are given a list of JSON analyses of code chunks. "
163
  "Aggregate these into a SINGLE overall JSON summary with the same keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
164
  "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
165
+ "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
166
  "Summarize and combine the information from all chunks. Do NOT include any explanation, markdown, or text outside the JSON. "
167
  "If a key is missing in all chunks, use an empty string. "
168
+ f"{requirements_section}"
169
  "Example of the ONLY valid output:\n"
170
+ '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}'
171
  )
172
  user_content = "Here are the chunk analyses:\n" + "\n".join(chunk_jsons)
173
  response = client.chat.completions.create(
 
181
  )
182
  return response.choices[0].message.content
183
 
184
+ def analyze_combined_file(output_file="combined_repo.txt", user_requirements: str = ""):
185
  """
186
  Reads the combined file, splits it into 500-line chunks, analyzes each chunk, and aggregates the LLM's output into a final summary.
187
+ Now includes user requirements for better relevance rating.
188
  Returns the chunk JSONs (for debugging) and the aggregated analysis as a string.
189
  """
190
  try:
 
194
  chunk_jsons = []
195
  for i in range(0, len(lines), chunk_size):
196
  chunk = "".join(lines[i:i+chunk_size])
197
+ analysis = analyze_code_chunk(chunk, user_requirements)
198
  chunk_jsons.append(analysis)
199
+ final_summary = aggregate_chunk_analyses(chunk_jsons, user_requirements)
200
  debug_output = (
201
  "==== Chunk JSON Outputs ===="
202
  + "\n\n".join([f"Chunk {i+1} JSON:\n{chunk_jsons[i]}" for i in range(len(chunk_jsons))])
app.py CHANGED
@@ -48,9 +48,10 @@ def read_csv_to_dataframe() -> pd.DataFrame:
48
  logger.error(f"Error reading CSV: {e}")
49
  return pd.DataFrame()
50
 
51
- def analyze_and_update_single_repo(repo_id: str) -> Tuple[str, str, pd.DataFrame]:
52
  """
53
  Downloads, analyzes a single repo, updates the CSV, and returns results.
 
54
  This function combines the logic of downloading, analyzing, and updating the CSV for one repo.
55
  """
56
  try:
@@ -61,7 +62,7 @@ def analyze_and_update_single_repo(repo_id: str) -> Tuple[str, str, pd.DataFrame
61
  with open(txt_path, "r", encoding="utf-8") as f:
62
  combined_content = f.read()
63
 
64
- llm_output = analyze_combined_file(txt_path)
65
 
66
  last_start = llm_output.rfind('{')
67
  last_end = llm_output.rfind('}')
@@ -73,7 +74,8 @@ def analyze_and_update_single_repo(repo_id: str) -> Tuple[str, str, pd.DataFrame
73
  if isinstance(llm_json, dict) and "error" not in llm_json:
74
  strengths = llm_json.get("strength", "N/A")
75
  weaknesses = llm_json.get("weaknesses", "N/A")
76
- summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
 
77
  else:
78
  summary = f"JSON extraction: FAILED\nRaw response might not be valid JSON."
79
 
@@ -128,7 +130,7 @@ def create_ui() -> gr.Blocks:
128
  /* Modern sleek design */
129
  .gradio-container {
130
  font-family: 'Inter', 'system-ui', sans-serif;
131
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
132
  min-height: 100vh;
133
  }
134
 
@@ -239,6 +241,7 @@ def create_ui() -> gr.Blocks:
239
  # Using simple, separate state objects for robustness.
240
  repo_ids_state = gr.State([])
241
  current_repo_idx_state = gr.State(0)
 
242
 
243
  gr.Markdown(
244
  """
@@ -284,6 +287,15 @@ def create_ui() -> gr.Blocks:
284
  with gr.TabItem("🔬 Analysis", id="analysis_tab"):
285
  gr.Markdown("### 🧪 Repository Analysis Engine")
286
 
 
 
 
 
 
 
 
 
 
287
  with gr.Row():
288
  analyze_next_btn = gr.Button("⚡ Analyze Next Repository", variant="primary", size="lg", scale=2)
289
  with gr.Column(scale=3):
@@ -396,7 +408,24 @@ def create_ui() -> gr.Blocks:
396
  status = f"Status: Found {len(unique_repo_ids)} repositories. Ready for analysis."
397
  return unique_repo_ids, 0, df, status, gr.update(selected="analysis_tab")
398
 
399
- def handle_analyze_next(repo_ids: List[str], current_idx: int) -> Tuple[str, str, pd.DataFrame, int, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  """Analyzes the next repository in the list."""
401
  if not repo_ids:
402
  return "", "", pd.DataFrame(), 0, "Status: No repositories to analyze. Please submit repo IDs first."
@@ -405,8 +434,10 @@ def create_ui() -> gr.Blocks:
405
 
406
  repo_id_to_analyze = repo_ids[current_idx]
407
  status = f"Status: Analyzing repository {current_idx + 1}/{len(repo_ids)}: {repo_id_to_analyze}"
 
 
408
 
409
- content, summary, df = analyze_and_update_single_repo(repo_id_to_analyze)
410
 
411
  next_idx = current_idx + 1
412
  if next_idx >= len(repo_ids):
@@ -437,15 +468,15 @@ def create_ui() -> gr.Blocks:
437
  history.append({"role": "assistant", "content": response})
438
  return history
439
 
440
- def handle_end_chat(history: List[Dict[str, str]]) -> Tuple[str, str]:
441
- """Ends the chat, extracts and sanitizes keywords from the conversation."""
442
  if not history:
443
- return "", "Status: Chat is empty, nothing to analyze."
444
 
445
  # Convert the full, valid history for the extraction logic
446
  tuple_history = convert_messages_to_tuples(history)
447
  if not tuple_history:
448
- return "", "Status: No completed conversations to analyze."
449
 
450
  # Get raw keywords string from the LLM
451
  raw_keywords_str = extract_keywords_from_conversation(tuple_history)
@@ -458,13 +489,16 @@ def create_ui() -> gr.Blocks:
458
  cleaned_keywords = [kw.strip() for kw in cleaned_keywords if kw.strip()]
459
 
460
  if not cleaned_keywords:
461
- return "", f"Status: Could not extract valid keywords. Raw LLM output: '{raw_keywords_str}'"
462
 
463
  # Join them into a clean, comma-separated string for the search tool
464
  final_keywords_str = ", ".join(cleaned_keywords)
465
 
466
- status = "Status: Keywords extracted. You can now use them to search."
467
- return final_keywords_str, status
 
 
 
468
 
469
  # --- Component Event Wiring ---
470
 
@@ -489,7 +523,7 @@ def create_ui() -> gr.Blocks:
489
  # Analysis Tab
490
  analyze_next_btn.click(
491
  fn=handle_analyze_next,
492
- inputs=[repo_ids_state, current_repo_idx_state],
493
  outputs=[content_output, summary_output, df_output, current_repo_idx_state, status_box_analysis]
494
  )
495
 
@@ -515,7 +549,11 @@ def create_ui() -> gr.Blocks:
515
  end_chat_btn.click(
516
  fn=handle_end_chat,
517
  inputs=[chatbot],
518
- outputs=[extracted_keywords_output, status_box_chatbot]
 
 
 
 
519
  )
520
  use_keywords_btn.click(
521
  fn=handle_keyword_search,
 
48
  logger.error(f"Error reading CSV: {e}")
49
  return pd.DataFrame()
50
 
51
+ def analyze_and_update_single_repo(repo_id: str, user_requirements: str = "") -> Tuple[str, str, pd.DataFrame]:
52
  """
53
  Downloads, analyzes a single repo, updates the CSV, and returns results.
54
+ Now includes user requirements for better relevance rating.
55
  This function combines the logic of downloading, analyzing, and updating the CSV for one repo.
56
  """
57
  try:
 
62
  with open(txt_path, "r", encoding="utf-8") as f:
63
  combined_content = f.read()
64
 
65
+ llm_output = analyze_combined_file(txt_path, user_requirements)
66
 
67
  last_start = llm_output.rfind('{')
68
  last_end = llm_output.rfind('}')
 
74
  if isinstance(llm_json, dict) and "error" not in llm_json:
75
  strengths = llm_json.get("strength", "N/A")
76
  weaknesses = llm_json.get("weaknesses", "N/A")
77
+ relevance = llm_json.get("relevance rating", "N/A")
78
+ summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}\n\nRelevance: {relevance}"
79
  else:
80
  summary = f"JSON extraction: FAILED\nRaw response might not be valid JSON."
81
 
 
130
  /* Modern sleek design */
131
  .gradio-container {
132
  font-family: 'Inter', 'system-ui', sans-serif;
133
+ background: linear-gradient(135deg, #0a0a0a 0%, #1a1a1a 100%);
134
  min-height: 100vh;
135
  }
136
 
 
241
  # Using simple, separate state objects for robustness.
242
  repo_ids_state = gr.State([])
243
  current_repo_idx_state = gr.State(0)
244
+ user_requirements_state = gr.State("") # Store user requirements from chatbot
245
 
246
  gr.Markdown(
247
  """
 
287
  with gr.TabItem("🔬 Analysis", id="analysis_tab"):
288
  gr.Markdown("### 🧪 Repository Analysis Engine")
289
 
290
+ # Display current user requirements
291
+ with gr.Row():
292
+ current_requirements_display = gr.Textbox(
293
+ label="📋 Current User Requirements",
294
+ interactive=False,
295
+ lines=3,
296
+ info="Requirements extracted from AI chat conversation for relevance rating"
297
+ )
298
+
299
  with gr.Row():
300
  analyze_next_btn = gr.Button("⚡ Analyze Next Repository", variant="primary", size="lg", scale=2)
301
  with gr.Column(scale=3):
 
408
  status = f"Status: Found {len(unique_repo_ids)} repositories. Ready for analysis."
409
  return unique_repo_ids, 0, df, status, gr.update(selected="analysis_tab")
410
 
411
+ def extract_user_requirements_from_chat(history: List[Dict[str, str]]) -> str:
412
+ """Extract user requirements from chatbot conversation."""
413
+ if not history:
414
+ return ""
415
+
416
+ user_messages = []
417
+ for msg in history:
418
+ if msg.get('role') == 'user':
419
+ user_messages.append(msg.get('content', ''))
420
+
421
+ if not user_messages:
422
+ return ""
423
+
424
+ # Combine all user messages as requirements
425
+ requirements = "\n".join([f"- {msg}" for msg in user_messages if msg.strip()])
426
+ return requirements
427
+
428
+ def handle_analyze_next(repo_ids: List[str], current_idx: int, user_requirements: str) -> Tuple[str, str, pd.DataFrame, int, str]:
429
  """Analyzes the next repository in the list."""
430
  if not repo_ids:
431
  return "", "", pd.DataFrame(), 0, "Status: No repositories to analyze. Please submit repo IDs first."
 
434
 
435
  repo_id_to_analyze = repo_ids[current_idx]
436
  status = f"Status: Analyzing repository {current_idx + 1}/{len(repo_ids)}: {repo_id_to_analyze}"
437
+ if user_requirements.strip():
438
+ status += f"\nUsing user requirements for relevance rating."
439
 
440
+ content, summary, df = analyze_and_update_single_repo(repo_id_to_analyze, user_requirements)
441
 
442
  next_idx = current_idx + 1
443
  if next_idx >= len(repo_ids):
 
468
  history.append({"role": "assistant", "content": response})
469
  return history
470
 
471
+ def handle_end_chat(history: List[Dict[str, str]]) -> Tuple[str, str, str]:
472
+ """Ends the chat, extracts and sanitizes keywords from the conversation, and extracts user requirements."""
473
  if not history:
474
+ return "", "Status: Chat is empty, nothing to analyze.", ""
475
 
476
  # Convert the full, valid history for the extraction logic
477
  tuple_history = convert_messages_to_tuples(history)
478
  if not tuple_history:
479
+ return "", "Status: No completed conversations to analyze.", ""
480
 
481
  # Get raw keywords string from the LLM
482
  raw_keywords_str = extract_keywords_from_conversation(tuple_history)
 
489
  cleaned_keywords = [kw.strip() for kw in cleaned_keywords if kw.strip()]
490
 
491
  if not cleaned_keywords:
492
+ return "", f"Status: Could not extract valid keywords. Raw LLM output: '{raw_keywords_str}'", ""
493
 
494
  # Join them into a clean, comma-separated string for the search tool
495
  final_keywords_str = ", ".join(cleaned_keywords)
496
 
497
+ # Extract user requirements for analysis
498
+ user_requirements = extract_user_requirements_from_chat(history)
499
+
500
+ status = "Status: Keywords extracted. User requirements saved for analysis."
501
+ return final_keywords_str, status, user_requirements
502
 
503
  # --- Component Event Wiring ---
504
 
 
523
  # Analysis Tab
524
  analyze_next_btn.click(
525
  fn=handle_analyze_next,
526
+ inputs=[repo_ids_state, current_repo_idx_state, user_requirements_state],
527
  outputs=[content_output, summary_output, df_output, current_repo_idx_state, status_box_analysis]
528
  )
529
 
 
549
  end_chat_btn.click(
550
  fn=handle_end_chat,
551
  inputs=[chatbot],
552
+ outputs=[extracted_keywords_output, status_box_chatbot, user_requirements_state]
553
+ ).then(
554
+ fn=lambda req: req if req.strip() else "No specific requirements extracted from conversation.",
555
+ inputs=[user_requirements_state],
556
+ outputs=[current_requirements_display]
557
  )
558
  use_keywords_btn.click(
559
  fn=handle_keyword_search,