# grok4-gpqa-eval / app.py
import os
from datetime import datetime

import gradio as gr
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (GROK_API_KEY, HF_TOKEN) from a local .env file
load_dotenv()
RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
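
# The progress file is presumably written incrementally by the background
# run_evaluation.py process started below; this app only ever reads it.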

def load_progress():
    """Read the progress file and return (dataframe, summary markdown, timestamp)."""
    if not os.path.exists(PROGRESS_FILE):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"
    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Calculate headline metrics
        total_questions = len(df)
        correct_answers = df['is_correct'].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df['response_time'].mean()

        summary_text = f"""
## Evaluation Progress
- **Questions Processed:** {total_questions} / 448
- **Current Accuracy:** {accuracy:.2f}%
- **Correct Answers:** {correct_answers}
- **Average Response Time:** {avg_response_time:.2f} seconds/question
"""

        # Report when the progress file was last written
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
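
# Illustrative shape of gpqa_progress.json, inferred from the columns read
# above (field names other than 'is_correct' and 'response_time' are guesses):
#
#     [
#         {"question_id": 1, "is_correct": true, "response_time": 12.4},
#         ...
#     ]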

def create_ui():
    """Build the Gradio dashboard from the latest progress snapshot."""
    df, summary, last_updated = load_progress()
    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
        with gr.Row():
            summary_box = gr.Markdown(summary)
            last_updated_box = gr.Markdown(last_updated)
        with gr.Row():
            # Simple plot: number of correct vs. incorrect answers.
            # gr.BarPlot expects a DataFrame whose columns match x/y, so the
            # value counts are reshaped into 'Answer Status'/'Count' columns.
            if not df.empty:
                plot_df = (
                    df['is_correct']
                    .value_counts()
                    .rename({True: 'Correct', False: 'Incorrect'})
                    .rename_axis('Answer Status')
                    .reset_index(name='Count')
                )
                plot = gr.BarPlot(plot_df, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)
        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)
    return demo
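
# The dashboard above is a one-time snapshot taken when the app starts. One
# possible extension (a sketch, not part of the original app, assuming a
# Gradio version that ships gr.Timer) is to poll the progress file from
# inside the Blocks context:
#
#     timer = gr.Timer(30)  # fires every 30 seconds
#     timer.tick(fn=lambda: load_progress()[1], outputs=summary_box)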

def check_environment():
    """Check if all required environment variables are set."""
    issues = []
    if not os.getenv('GROK_API_KEY'):
        issues.append("GROK_API_KEY not found in environment")
    if not os.getenv('HF_TOKEN'):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    return issues
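
# Example .env contents (values are placeholders):
#
#     GROK_API_KEY=your-xai-api-key
#     HF_TOKEN=your-huggingface-token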

def start_evaluation_safe():
    """Safely start the evaluation process with error handling."""
    issues = check_environment()
    if issues:
        print("⚠️ Environment issues detected:")
        for issue in issues:
            print(f"  - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None

    import subprocess
    import sys

    print("Starting background evaluation process...")
    command = [
        sys.executable,
        "run_evaluation.py",
        "--config", "official_config.yaml",
        "--models", "grok-4-0709",
        "--benchmarks", "gpqa",
    ]
    try:
        # Use Popen to run in the background without blocking the UI
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None
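
# Note: the Popen handle returned above is never waited on or terminated. If
# cleanup mattered, the caller could, for example, register
# atexit.register(process.terminate). That is a sketch, not part of the
# original flow.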

if __name__ == "__main__":
    # Check environment first
    issues = check_environment()
    if issues:
        # Missing configuration: show the dashboard without starting an evaluation
        print("\n⚠️ Running in demo mode due to missing configuration")
        ui = create_ui()
    else:
        # Start the background evaluation process, then build the dashboard
        process = start_evaluation_safe()
        ui = create_ui()

    # Launch the UI
    ui.launch(server_name="0.0.0.0", server_port=7860)