Update app.py
app.py
CHANGED
@@ -7,6 +7,7 @@ from toy_dataset_eval import evaluate_toy_dataset
 from mmlu_eval_original import evaluate_mmlu_batched
 import spaces
 import pandas as pd
+import time  # Added for timing functionality

 # Read token and login
 hf_token = os.getenv("HF_TOKEN_READ_WRITE")
@@ -27,6 +28,7 @@ model_loaded = False
 def load_model():
     """Loads the Mistral model and tokenizer and updates the load status."""
     global tokenizer, model, model_loaded
+    start_time = time.time()  # Start timing
     try:
         if tokenizer is None:
             tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
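The timer started here is read back once the model finishes loading (see the next hunk) so the status message can report the load time. As a rough sketch of the same pattern (the `timed` helper below is hypothetical, not part of `app.py`), `time.perf_counter()` is usually preferred over `time.time()` for durations because it is monotonic:

```python
# Hypothetical helper illustrating the timing pattern used in load_model();
# not part of app.py.
import time

def timed(fn, *args, **kwargs):
    """Call fn and return (result, elapsed_seconds)."""
    start = time.perf_counter()   # monotonic clock, robust to wall-clock changes
    result = fn(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed

# Usage sketch: status, seconds = timed(load_model)
```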
@@ -38,14 +40,15 @@ def load_model():
         )
         model.to('cuda')
         model_loaded = True
-
+        elapsed_time = time.time() - start_time  # Calculate elapsed time
+        return f"✅ Model Loaded in {elapsed_time:.2f} seconds!"
     except Exception as e:
         model_loaded = False
         return f"❌ Model Load Failed: {str(e)}"
 # ---------------------------------------------------------------------------
 # 2. Toy Evaluation
 # ---------------------------------------------------------------------------
-@spaces.GPU
+@spaces.GPU(duration=120)
 def run_toy_evaluation():
     """Runs the toy dataset evaluation."""
     if not model_loaded:
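On ZeroGPU hardware the `spaces.GPU` decorator requests a GPU only for the duration of the decorated call, and `duration=120` asks for a longer window than the default before the allocation is released. A minimal sketch of the pattern, assuming the `spaces` package that Hugging Face provides on ZeroGPU Spaces (`generate_text` below is an illustrative function, not taken from `app.py`):

```python
# Sketch of the ZeroGPU pattern, assuming the `spaces` package available on
# Hugging Face ZeroGPU Spaces; `generate_text` is a hypothetical example
# function, not part of app.py.
import spaces
import torch

@spaces.GPU(duration=120)  # request the GPU for up to ~120 s for this call
def generate_text(model, tokenizer, prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
```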
@@ -54,8 +57,12 @@ def run_toy_evaluation():
     if not model_loaded:
         return "⚠️ Model not loaded. Please load the model first."

+    start_time = time.time()  # Start timing
     results = evaluate_toy_dataset(model, tokenizer)
-
+    elapsed_time = time.time() - start_time  # Calculate elapsed time
+
+    return f"{results}\n\nEvaluation completed in {elapsed_time:.2f} seconds.", \
+           f"<div>Time taken: {elapsed_time:.2f} seconds</div>"  # Return timing info

 # ---------------------------------------------------------------------------
 # 3. MMLU Evaluation call
@@ -77,7 +84,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     load_model()

     if not model_loaded:
-        return "⚠️ Model not loaded. Please load the model first."
+        return "⚠️ Model not loaded. Please load the model first.", None

     # Convert num_subjects to -1 if all_subjects is True
     if all_subjects:
@@ -87,7 +94,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     if all_questions:
         num_questions = -1

-    # Run evaluation
+    # Run evaluation with timing
+    start_time = time.time()  # Start timing
     results = evaluate_mmlu(
         model,
         tokenizer,
@@ -96,6 +104,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
         num_shots=num_shots,
         auto_batch_size=True
     )
+    elapsed_time = time.time() - start_time  # Calculate elapsed time

     # Format results
     overall_acc = results["overall_accuracy"]
@@ -105,17 +114,35 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     # Create DataFrame from results table
     results_df = pd.DataFrame(results["full_accuracy_table"])

+    # Calculate totals for the overall row
+    total_samples = results_df['Num_samples'].sum()
+    total_correct = results_df['Num_correct'].sum()
+
+    # Create overall row
+    overall_row = pd.DataFrame({
+        'Subject': ['**Overall**'],
+        'Num_samples': [total_samples],
+        'Num_correct': [total_correct],
+        'Accuracy': [overall_acc]
+    })
+
+    # Concatenate overall row with results
+    results_df = pd.concat([overall_row, results_df], ignore_index=True)
+
+    # Verify that the overall accuracy is consistent with the total correct/total samples
+    assert abs(overall_acc - (total_correct / total_samples)) < 1e-6, \
+        "Overall accuracy calculation mismatch detected"
+
     # Format the report
     report = (
         f"### Overall Results\n"
         f"* Overall Accuracy: {overall_acc:.3f}\n"
         f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
-        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
-        f"
-        f"{results_df.to_markdown()}\n"
+        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
+        f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
     )

-    return report
+    return report, results_df  # Return both text report and dataframe

 # ---------------------------------------------------------------------------
 # 4. Gradio Interface
@@ -186,6 +213,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
     results_output = gr.Markdown(label="Evaluation Results")
+    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")

     # Connect components
     load_button.click(fn=load_model, inputs=None, outputs=load_status)
@@ -233,7 +261,7 @@ with gr.Blocks() as demo:
             all_questions_checkbox,
             num_questions_slider
         ],
-        outputs=results_output
+        outputs=[results_output, results_table]
     )

-demo.launch()
+demo.launch()
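Because `run_mmlu_evaluation` now returns `(report, results_df)`, the click wiring lists two output components and Gradio routes each element of the tuple to the matching component, in order; the earlier `return "⚠️ Model not loaded. Please load the model first.", None` keeps the arity consistent when the model is missing. A stripped-down sketch of that wiring (the handler body and its data are illustrative, not the Space's actual code):

```python
# Minimal sketch of a two-output click wiring like the one above
# (the handler and its data are illustrative, not taken from app.py).
import gradio as gr
import pandas as pd

def fake_mmlu_evaluation():
    report = "### Overall Results\n* Overall Accuracy: 0.733"
    table = pd.DataFrame({"Subject": ["**Overall**"], "Accuracy": [0.733]})
    return report, table  # tuple elements map onto the outputs list, in order

with gr.Blocks() as demo:
    run_button = gr.Button("Run MMLU Evaluation", variant="primary")
    results_output = gr.Markdown(label="Evaluation Results")
    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")
    run_button.click(fn=fake_mmlu_evaluation, inputs=None,
                     outputs=[results_output, results_table])

demo.launch()
```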