rohansampath committed on
Commit
30e6a06
·
verified ·
1 Parent(s): e6f8dd1

Update run_evaluation.py

Browse files
Files changed (1) hide show
  1. run_evaluation.py +24 -27
run_evaluation.py CHANGED
@@ -119,37 +119,34 @@ def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
119
  comparison_df = pd.DataFrame(comparison_data)
120
 
121
  # Format the report
122
- report = (
123
- f"### Head-to-Head Comparison Results\n\n"
124
- f"#### Model 1: {model1_config['name']}\n"
125
- f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
126
- f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
127
- f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
128
- f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
129
- f"#### Model 2: {model2_config['name']}\n"
130
- f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
131
- f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
132
- f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
133
- f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
134
- f"#### Overall Winner: {overall_winner}\n"
135
- f"* Margin: {abs(overall_diff):.3f}\n"
136
  )
137
-
138
- # Return values that re-enable UI components after completion
139
- return {
140
- 'report': report,
141
- 'comparison_df': comparison_df,
142
- 'success': True
143
- }
144
 
145
  except Exception as e:
146
  # Handle errors gracefully
147
  error_trace = traceback.format_exc()
148
  error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
149
 
150
- # Return error information
151
- return {
152
- 'report': error_message,
153
- 'comparison_df': None,
154
- 'success': False
155
- }
 
 
 
 
 
 
 
 
119
  comparison_df = pd.DataFrame(comparison_data)
120
 
121
  # Format the report
122
+ return (
123
+ report, # Report markdown
124
+ comparison_df, # Results dataframe
125
+ gr.Button.update(interactive=True), # Enable eval button
126
+ gr.Button.update(interactive=False), # Disable cancel button
127
+ gr.Radio.update(interactive=True), # Enable subject selection mode
128
+ gr.Slider.update(interactive=True), # Enable subjects slider
129
+ gr.Checkbox.update(interactive=True), # Enable all questions checkbox
130
+ gr.Slider.update(interactive=True), # Enable questions slider
131
+ gr.Dropdown.update(interactive=True), # Enable model dropdown
132
+ gr.Column.update(visible=True) # Make table container visible
 
 
 
133
  )
 
 
 
 
 
 
 
134
 
135
  except Exception as e:
136
  # Handle errors gracefully
137
  error_trace = traceback.format_exc()
138
  error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
139
 
140
+ # Return error values for all 10 components
141
+ return (
142
+ error_message, # Error message in markdown
143
+ None, # No dataframe
144
+ gr.Button.update(interactive=True), # Enable eval button
145
+ gr.Button.update(interactive=False), # Disable cancel button
146
+ gr.Radio.update(interactive=True), # Enable subject selection mode
147
+ gr.Slider.update(interactive=True), # Enable subjects slider
148
+ gr.Checkbox.update(interactive=True), # Enable all questions checkbox
149
+ gr.Slider.update(interactive=True), # Enable questions slider
150
+ gr.Dropdown.update(interactive=True), # Enable model dropdown
151
+ gr.Column.update(visible=False) # Hide table container
152
+ )