Commit 671bd95 · verified · committed by rohansampath · 1 Parent(s): 95f85dc

Update app.py

Files changed (1):
  1. app.py  +39 -11
app.py CHANGED
@@ -7,6 +7,7 @@ from toy_dataset_eval import evaluate_toy_dataset
 from mmlu_eval_original import evaluate_mmlu_batched
 import spaces
 import pandas as pd
+import time  # Added for timing functionality
 
 # Read token and login
 hf_token = os.getenv("HF_TOKEN_READ_WRITE")
@@ -27,6 +28,7 @@ model_loaded = False
 def load_model():
     """Loads the Mistral model and tokenizer and updates the load status."""
     global tokenizer, model, model_loaded
+    start_time = time.time()  # Start timing
     try:
         if tokenizer is None:
             tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
@@ -38,14 +40,15 @@ def load_model():
         )
         model.to('cuda')
         model_loaded = True
-        return "✅ Model Loaded!"
+        elapsed_time = time.time() - start_time  # Calculate elapsed time
+        return f"✅ Model Loaded in {elapsed_time:.2f} seconds!"
     except Exception as e:
         model_loaded = False
         return f"❌ Model Load Failed: {str(e)}"
 # ---------------------------------------------------------------------------
 # 2. Toy Evaluation
 # ---------------------------------------------------------------------------
-@spaces.GPU (duration=120)
+@spaces.GPU(duration=120)
 def run_toy_evaluation():
     """Runs the toy dataset evaluation."""
     if not model_loaded:
@@ -54,8 +57,12 @@ def run_toy_evaluation():
     if not model_loaded:
         return "⚠️ Model not loaded. Please load the model first."
 
+    start_time = time.time()  # Start timing
     results = evaluate_toy_dataset(model, tokenizer)
-    return results  # Ensure load confirmation is shown before results
+    elapsed_time = time.time() - start_time  # Calculate elapsed time
+
+    return f"{results}\n\nEvaluation completed in {elapsed_time:.2f} seconds.", \
+           f"<div>Time taken: {elapsed_time:.2f} seconds</div>"  # Return timing info
 
 # ---------------------------------------------------------------------------
 # 3. MMLU Evaluation call
@@ -77,7 +84,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
     load_model()
 
     if not model_loaded:
-        return "⚠️ Model not loaded. Please load the model first."
+        return "⚠️ Model not loaded. Please load the model first.", None
 
     # Convert num_subjects to -1 if all_subjects is True
     if all_subjects:
@@ -87,7 +94,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
     if all_questions:
         num_questions = -1
 
-    # Run evaluation
+    # Run evaluation with timing
+    start_time = time.time()  # Start timing
     results = evaluate_mmlu(
         model,
         tokenizer,
@@ -96,6 +104,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
         num_shots=num_shots,
         auto_batch_size=True
     )
+    elapsed_time = time.time() - start_time  # Calculate elapsed time
 
     # Format results
     overall_acc = results["overall_accuracy"]
@@ -105,17 +114,35 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
     # Create DataFrame from results table
     results_df = pd.DataFrame(results["full_accuracy_table"])
 
+    # Calculate totals for the overall row
+    total_samples = results_df['Num_samples'].sum()
+    total_correct = results_df['Num_correct'].sum()
+
+    # Create overall row
+    overall_row = pd.DataFrame({
+        'Subject': ['**Overall**'],
+        'Num_samples': [total_samples],
+        'Num_correct': [total_correct],
+        'Accuracy': [overall_acc]
+    })
+
+    # Concatenate overall row with results
+    results_df = pd.concat([overall_row, results_df], ignore_index=True)
+
+    # Verify that the overall accuracy is consistent with the total correct/total samples
+    assert abs(overall_acc - (total_correct / total_samples)) < 1e-6, \
+        "Overall accuracy calculation mismatch detected"
+
     # Format the report
     report = (
         f"### Overall Results\n"
         f"* Overall Accuracy: {overall_acc:.3f}\n"
         f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
-        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n\n"
-        f"### Detailed Results Table\n"
-        f"{results_df.to_markdown()}\n"
+        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
+        f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
     )
 
-    return report
+    return report, results_df  # Return both text report and dataframe
 
 # ---------------------------------------------------------------------------
 # 4. Gradio Interface
@@ -186,6 +213,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
     results_output = gr.Markdown(label="Evaluation Results")
+    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")
 
     # Connect components
     load_button.click(fn=load_model, inputs=None, outputs=load_status)
@@ -233,7 +261,7 @@ with gr.Blocks() as demo:
             all_questions_checkbox,
             num_questions_slider
         ],
-        outputs=results_output
+        outputs=[results_output, results_table]
     )
 
-demo.launch()
+demo.launch()
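
Note on the wiring (not part of the commit itself): run_mmlu_evaluation now returns two values, a Markdown report and a pandas DataFrame, and the click handler accordingly lists two output components. Below is a minimal, self-contained sketch of that Gradio pattern, assuming the standard gradio Blocks API; run_eval_demo and its toy data are hypothetical stand-ins, not the Space's actual code.

# Minimal sketch: one returned value per entry in the outputs list,
# mirroring outputs=[results_output, results_table] from this commit.
import time

import gradio as gr
import pandas as pd


def run_eval_demo():
    """Hypothetical stand-in for run_mmlu_evaluation: returns (markdown, dataframe)."""
    start_time = time.time()
    results_df = pd.DataFrame(
        {"Subject": ["abstract_algebra", "anatomy"], "Accuracy": [0.42, 0.57]}
    )
    elapsed_time = time.time() - start_time
    report = f"### Overall Results\n* Evaluation completed in {elapsed_time:.2f} seconds\n"
    # The returned tuple is distributed across the outputs list in order.
    return report, results_df


with gr.Blocks() as demo:
    run_button = gr.Button("Run MMLU Evaluation", variant="primary")
    results_output = gr.Markdown(label="Evaluation Results")
    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")
    run_button.click(fn=run_eval_demo, inputs=None, outputs=[results_output, results_table])

if __name__ == "__main__":
    demo.launch()

Returning the DataFrame to a gr.DataFrame component (instead of embedding results_df.to_markdown() in the report, as the old code did) is what makes the detailed results table sortable in the UI.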