# grok4-gpqa-eval / app.py
import os
from datetime import datetime

import gradio as gr
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (GROK_API_KEY, HF_TOKEN) from a local .env file
load_dotenv()
RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
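
# The progress file is presumably written incrementally by the background
# run_evaluation.py process started below; this app only ever reads it.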

def load_progress():
    """Read the progress file and return (dataframe, summary markdown, timestamp)."""
    if not os.path.exists(PROGRESS_FILE):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"
    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Calculate headline metrics
        total_questions = len(df)
        correct_answers = df['is_correct'].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df['response_time'].mean()

        summary_text = f"""
## Evaluation Progress
- **Questions Processed:** {total_questions} / 448
- **Current Accuracy:** {accuracy:.2f}%
- **Correct Answers:** {correct_answers}
- **Average Response Time:** {avg_response_time:.2f} seconds/question
"""

        # Report when the progress file was last written
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
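
# Illustrative shape of gpqa_progress.json, inferred from the columns read
# above (field names other than 'is_correct' and 'response_time' are guesses):
#
#     [
#         {"question_id": 1, "is_correct": true, "response_time": 12.4},
#         ...
#     ]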

def create_ui():
    """Build the Gradio dashboard from the latest progress snapshot."""
    df, summary, last_updated = load_progress()
    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
        with gr.Row():
            summary_box = gr.Markdown(summary)
            last_updated_box = gr.Markdown(last_updated)
        with gr.Row():
            # Simple plot: number of correct vs. incorrect answers.
            # gr.BarPlot expects a DataFrame whose columns match x/y, so the
            # value counts are reshaped into 'Answer Status'/'Count' columns.
            if not df.empty:
                plot_df = (
                    df['is_correct']
                    .value_counts()
                    .rename({True: 'Correct', False: 'Incorrect'})
                    .rename_axis('Answer Status')
                    .reset_index(name='Count')
                )
                plot = gr.BarPlot(plot_df, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)
        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)
    return demo
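
# The dashboard above is a one-time snapshot taken when the app starts. One
# possible extension (a sketch, not part of the original app, assuming a
# Gradio version that ships gr.Timer) is to poll the progress file from
# inside the Blocks context:
#
#     timer = gr.Timer(30)  # fires every 30 seconds
#     timer.tick(fn=lambda: load_progress()[1], outputs=summary_box)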

def check_environment():
    """Check if all required environment variables are set."""
    issues = []
    if not os.getenv('GROK_API_KEY'):
        issues.append("GROK_API_KEY not found in environment")
    if not os.getenv('HF_TOKEN'):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    return issues
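
# Example .env contents (values are placeholders):
#
#     GROK_API_KEY=your-xai-api-key
#     HF_TOKEN=your-huggingface-token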

def start_evaluation_safe():
    """Safely start the evaluation process with error handling."""
    issues = check_environment()
    if issues:
        print("⚠️ Environment issues detected:")
        for issue in issues:
            print(f"  - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None

    import subprocess
    import sys

    print("Starting background evaluation process...")
    command = [
        sys.executable,
        "run_evaluation.py",
        "--config", "official_config.yaml",
        "--models", "grok-4-0709",
        "--benchmarks", "gpqa",
    ]
    try:
        # Use Popen to run in the background without blocking the UI
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None
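
# Note: the Popen handle returned above is never waited on or terminated. If
# cleanup mattered, the caller could, for example, register
# atexit.register(process.terminate). That is a sketch, not part of the
# original flow.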

if __name__ == "__main__":
    # Check environment first
    issues = check_environment()
    if issues:
        # Missing configuration: show the dashboard without starting an evaluation
        print("\n⚠️ Running in demo mode due to missing configuration")
        ui = create_ui()
    else:
        # Start the background evaluation process, then build the dashboard
        process = start_evaluation_safe()
        ui = create_ui()

    # Launch the UI
    ui.launch(server_name="0.0.0.0", server_port=7860)