Update app.py
app.py
CHANGED
@@ -7,6 +7,7 @@ from toy_dataset_eval import evaluate_toy_dataset
 from mmlu_eval_original import evaluate_mmlu_batched
 import spaces
 import pandas as pd
+import time  # Added for timing functionality

 # Read token and login
 hf_token = os.getenv("HF_TOKEN_READ_WRITE")
@@ -27,6 +28,7 @@ model_loaded = False
 def load_model():
     """Loads the Mistral model and tokenizer and updates the load status."""
     global tokenizer, model, model_loaded
+    start_time = time.time()  # Start timing
     try:
         if tokenizer is None:
             tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
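The timer started here is read back once the model finishes loading (see the next hunk) so the status message can report the load time. As a rough sketch of the same pattern (the `timed` helper below is hypothetical, not part of `app.py`), `time.perf_counter()` is usually preferred over `time.time()` for durations because it is monotonic:

```python
# Hypothetical helper illustrating the timing pattern used in load_model();
# not part of app.py.
import time

def timed(fn, *args, **kwargs):
    """Call fn and return (result, elapsed_seconds)."""
    start = time.perf_counter()   # monotonic clock, robust to wall-clock changes
    result = fn(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed

# Usage sketch: status, seconds = timed(load_model)
```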
@@ -38,14 +40,15 @@ def load_model():
         )
         model.to('cuda')
         model_loaded = True
-
+        elapsed_time = time.time() - start_time  # Calculate elapsed time
+        return f"✅ Model Loaded in {elapsed_time:.2f} seconds!"
     except Exception as e:
         model_loaded = False
         return f"❌ Model Load Failed: {str(e)}"
 # ---------------------------------------------------------------------------
 # 2. Toy Evaluation
 # ---------------------------------------------------------------------------
-@spaces.GPU
+@spaces.GPU(duration=120)
 def run_toy_evaluation():
     """Runs the toy dataset evaluation."""
     if not model_loaded:
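On ZeroGPU hardware the `spaces.GPU` decorator requests a GPU only for the duration of the decorated call, and `duration=120` asks for a longer window than the default before the allocation is released. A minimal sketch of the pattern, assuming the `spaces` package that Hugging Face provides on ZeroGPU Spaces (`generate_text` below is an illustrative function, not taken from `app.py`):

```python
# Sketch of the ZeroGPU pattern, assuming the `spaces` package available on
# Hugging Face ZeroGPU Spaces; `generate_text` is a hypothetical example
# function, not part of app.py.
import spaces
import torch

@spaces.GPU(duration=120)  # request the GPU for up to ~120 s for this call
def generate_text(model, tokenizer, prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
```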
@@ -54,8 +57,12 @@ def run_toy_evaluation():
     if not model_loaded:
         return "⚠️ Model not loaded. Please load the model first."

+    start_time = time.time()  # Start timing
     results = evaluate_toy_dataset(model, tokenizer)
-
+    elapsed_time = time.time() - start_time  # Calculate elapsed time
+
+    return f"{results}\n\nEvaluation completed in {elapsed_time:.2f} seconds.", \
+           f"<div>Time taken: {elapsed_time:.2f} seconds</div>"  # Return timing info

 # ---------------------------------------------------------------------------
 # 3. MMLU Evaluation call
@@ -77,7 +84,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     load_model()

     if not model_loaded:
-        return "⚠️ Model not loaded. Please load the model first."
+        return "⚠️ Model not loaded. Please load the model first.", None

     # Convert num_subjects to -1 if all_subjects is True
     if all_subjects:
@@ -87,7 +94,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     if all_questions:
         num_questions = -1

-    # Run evaluation
+    # Run evaluation with timing
+    start_time = time.time()  # Start timing
     results = evaluate_mmlu(
         model,
         tokenizer,
@@ -96,6 +104,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
         num_shots=num_shots,
         auto_batch_size=True
     )
+    elapsed_time = time.time() - start_time  # Calculate elapsed time

     # Format results
     overall_acc = results["overall_accuracy"]
@@ -105,17 +114,35 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     # Create DataFrame from results table
     results_df = pd.DataFrame(results["full_accuracy_table"])

+    # Calculate totals for the overall row
+    total_samples = results_df['Num_samples'].sum()
+    total_correct = results_df['Num_correct'].sum()
+
+    # Create overall row
+    overall_row = pd.DataFrame({
+        'Subject': ['**Overall**'],
+        'Num_samples': [total_samples],
+        'Num_correct': [total_correct],
+        'Accuracy': [overall_acc]
+    })
+
+    # Concatenate overall row with results
+    results_df = pd.concat([overall_row, results_df], ignore_index=True)
+
+    # Verify that the overall accuracy is consistent with the total correct/total samples
+    assert abs(overall_acc - (total_correct / total_samples)) < 1e-6, \
+        "Overall accuracy calculation mismatch detected"
+
     # Format the report
     report = (
         f"### Overall Results\n"
         f"* Overall Accuracy: {overall_acc:.3f}\n"
         f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
-        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
-        f"
-        f"{results_df.to_markdown()}\n"
+        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
+        f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
     )

-    return report
+    return report, results_df  # Return both text report and dataframe

 # ---------------------------------------------------------------------------
 # 4. Gradio Interface
@@ -186,6 +213,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
     results_output = gr.Markdown(label="Evaluation Results")
+    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")

     # Connect components
     load_button.click(fn=load_model, inputs=None, outputs=load_status)
@@ -233,7 +261,7 @@ with gr.Blocks() as demo:
             all_questions_checkbox,
             num_questions_slider
         ],
-        outputs=results_output
+        outputs=[results_output, results_table]
     )

-demo.launch()
+demo.launch()
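Because `run_mmlu_evaluation` now returns `(report, results_df)`, the click wiring lists two output components and Gradio routes each element of the tuple to the matching component, in order; the earlier `return "⚠️ Model not loaded. Please load the model first.", None` keeps the arity consistent when the model is missing. A stripped-down sketch of that wiring (the handler body and its data are illustrative, not the Space's actual code):

```python
# Minimal sketch of a two-output click wiring like the one above
# (the handler and its data are illustrative, not taken from app.py).
import gradio as gr
import pandas as pd

def fake_mmlu_evaluation():
    report = "### Overall Results\n* Overall Accuracy: 0.733"
    table = pd.DataFrame({"Subject": ["**Overall**"], "Accuracy": [0.733]})
    return report, table  # tuple elements map onto the outputs list, in order

with gr.Blocks() as demo:
    run_button = gr.Button("Run MMLU Evaluation", variant="primary")
    results_output = gr.Markdown(label="Evaluation Results")
    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")
    run_button.click(fn=fake_mmlu_evaluation, inputs=None,
                     outputs=[results_output, results_table])

demo.launch()
```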