Spaces:
Running
Running
| import json | |
| from mathruler.grader import extract_boxed_content, grade_answer | |
| import openai | |
| import requests | |
| from tqdm import tqdm | |
| import random | |
| import argparse | |
| import os | |
# --- Script configuration ------------------------------------------------
parser = argparse.ArgumentParser(
    description="Re-grade low-scoring evaluation answers with a GPT-based checker."
)
parser.add_argument(
    "--model_name",
    type=str,
    default="Qwen/Qwen2.5-7B-Instruct",
    help="Model whose evaluation results are re-checked.",
)
args = parser.parse_args()

# Root directory containing evaluation/<model>/results_<dataset>.json files.
# NOTE(review): os.getenv returns None when STORAGE_PATH is unset, which would
# later yield paths like "None/evaluation/..." — confirm the env var is always
# exported before running this script.
STORAGE_PATH = os.getenv("STORAGE_PATH")

# Endpoint/key pairs for the OpenAI-compatible checker API; api_urls[i] pairs
# with api_keys[i]. Both are empty here and are presumably filled in before
# use — while empty, process_example always falls into its exception handler
# and returns "No".
api_urls = []
api_keys = []
def process_example(answer, response):
    """Ask a GPT-4o endpoint whether `answer` matches the ground-truth answer.

    Parameters
    ----------
    answer : str
        The answer to be checked.
    response : str
        Presented to the checker as the ground-truth answer.
        NOTE(review): the call site passes results[i]['answer'] as `answer`
        and results[i]['response'] as `response` — verify these are not
        swapped relative to which field holds the reference answer.

    Returns
    -------
    str
        The checker's reply (expected to be "Yes" or "No"). Returns "No" on
        any failure: no endpoints configured, network/HTTP error, or a
        malformed reply.
    """
    try:
        payload = {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": "You are a math answer checker."},
                {"role": "user", "content": f"Hi, there is a answer: {answer}\n\n, and the ground truth answer is: {response}\n\n, please check whether the answer is correct or not, and return the **only** Yes or No."}
            ],
            "temperature": 0.1
        }
        # Pick one endpoint/key pair at random to spread load across them.
        # (randrange raises ValueError on an empty list, caught below.)
        api_index = random.randrange(len(api_urls))
        api_url = api_urls[api_index]
        api_key = api_keys[api_index]
        # Local renamed from `response` so it no longer shadows the parameter.
        resp = requests.post(
            api_url,
            headers={"api-key": api_key, "Content-Type": "application/json"},
            json=payload,
            timeout=20,
        )
        # Surface HTTP errors explicitly instead of a KeyError on the body;
        # still funnels into the same "No" fallback below.
        resp.raise_for_status()
        return resp.json()['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort checker: any failure counts as "not verified correct".
        print(e)
        return "No"
# --- Re-grading pass -----------------------------------------------------
# For each model/dataset pair, any item the automatic grader scored below 0.5
# gets a second opinion from the GPT checker; if the checker says "Yes" the
# item's score is promoted to 1. Per-dataset percentage averages are then
# printed and appended to final_results.jsonl.
new_results = []
for model_name in [args.model_name]:
    for dataset in [
        "math",
        "gsm8k",
        "amc",
        "minerva",
        "olympiad",
        "aime2024",
        "aime2025",
    ]:
        results_path = (
            f'{STORAGE_PATH}/evaluation/{model_name.replace("/","_")}'
            f'/results_{dataset}.json'
        )
        with open(results_path, 'r') as f:
            results = json.load(f)

        # NOTE(review): the final element of `results` is excluded both from
        # re-checking and from the average — presumably a summary record
        # rather than a scored item; confirm against the file producer.
        scored = results[:-1]
        for record in tqdm(scored):
            if record['score'] < 0.5:
                gpt_check = process_example(record['answer'], record['response'])
                if "yes" in gpt_check.lower():
                    record['score'] = 1

        # Average score as a percentage, computed once (the original
        # recomputed the identical expression for the print and the file
        # write). Guarded so a file with <= 1 record cannot divide by zero.
        avg = (
            round(sum(r['score'] for r in scored) / len(scored) * 100, 2)
            if scored else 0.0
        )
        new_results.append({
            'model': model_name,
            'dataset': dataset,
            'score': avg,
        })

print(new_results)

# Append one JSON line per (model, dataset) summary. The original wrote a
# single line using whatever model_name/dataset held after the loops finished,
# recording only the last dataset even though new_results had collected every
# pair — writing all of new_results matches the evident intent.
with open('final_results.jsonl', 'a') as f:
    for entry in new_results:
        json.dump(entry, f)
        f.write('\n')