Upload 3 files
- evaluate.py +103 -0
- evaluate.sh +31 -0
- upload.py +60 -0
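Taken together, the three files form a small question-filtering pipeline: evaluate.sh fans evaluate.py out across 8 GPUs to compute a majority-vote score for each generated question, and upload.py merges the per-GPU result shards, plots the score distribution, and pushes the filtered questions to the Hugging Face Hub. A rough end-to-end invocation might look like the sketch below; the storage path, model name, experiment name, and repository name are placeholders, and the scripts are assumed to sit in a question_evaluate/ directory as the path inside evaluate.sh suggests. Note that the save name passed to evaluate.sh must match the --experiment_name passed to upload.py, since that is how the result shards are located.

    export STORAGE_PATH=/path/to/storage        # must contain generated_question/<save_name>_<gpu>.json
    export HUGGINGFACENAME=your-hf-username     # Hub namespace used by upload.py
    bash question_evaluate/evaluate.sh Qwen/Qwen3-4B-Base my_experiment
    python question_evaluate/upload.py --experiment_name my_experiment --repo_name my_filtered_questions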
    	
        evaluate.py
    ADDED
    
import argparse
import json
import os
import re

import vllm
from transformers import AutoTokenizer

from evaluation.datasets_loader import get_dataset_handler  # only needed for the commented-out math handler below
from mathruler.grader import extract_boxed_content, grade_answer

# math_verify = get_dataset_handler("math")

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="Qwen/Qwen3-4B-Base")
parser.add_argument("--num_samples", type=int, default=10)
parser.add_argument("--suffix", type=str, default="77")
parser.add_argument("--save_name", type=str, default="")
args = parser.parse_args()

STORAGE_PATH = os.getenv("STORAGE_PATH")

# Load this shard's generated questions, then delete the input file so it is not re-processed.
print("start load")
input_path = f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}.json"
with open(input_path, "r") as f:
    data = json.load(f)
os.remove(input_path)


def extract_answer(response):
    # Extract the content of the first \boxed{...} in a response (unused; extract_boxed_content is used instead).
    return re.search(r"\\boxed{(.*?)}", response).group(1)


tokenizer = AutoTokenizer.from_pretrained(args.model)
model = vllm.LLM(
    model=args.model,
    tokenizer=args.model,
    gpu_memory_utilization=0.85,
    seed=int(args.suffix),
)
sample_params = vllm.SamplingParams(
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0,
    top_k=40,
    stop_token_ids=[tokenizer.eos_token_id],
    n=args.num_samples,
)

# Questions with score == -1 are set aside; only those with score == 0 are evaluated here.
wrong_data = [item for item in data if item["score"] == -1]
correct_data = [item for item in data if item["score"] == 0]
questions = [item["question"] for item in correct_data]
answers = [item["answer"] for item in correct_data]

chats = [
    [
        {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
        {"role": "user", "content": question},
    ]
    for question in questions
]
if tokenizer.chat_template:
    prompts = [
        tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True, add_special_tokens=True)
        for chat in chats
    ]
else:
    prompts = ["system: " + chat[0]["content"] + "\n" + "user: " + chat[1]["content"] for chat in chats]

responses = model.generate(prompts, sampling_params=sample_params, use_tqdm=True)
print(len(data))

results_all = []
for response, answer, question in zip(responses, answers, questions):
    try:
        error_flag = False
        results = [extract_boxed_content(output.text) for output in response.outputs]

        # Cluster the sampled answers into equivalence groups using the grader.
        answer_counts = {}
        for result in results:
            found_match = False
            try:
                for existing_answer in answer_counts:
                    if (
                        grade_answer(result, existing_answer)
                        or grade_answer(existing_answer, result)
                        or result == existing_answer
                        or ("no " in result.lower() and "no " in existing_answer.lower())
                    ):
                        answer_counts[existing_answer] += 1
                        found_match = True
                        break
            except Exception:
                error_flag = True
                break
            # If no existing group matched, open a new answer group.
            if not found_match:
                answer_counts[result] = 1

        if error_flag or not answer_counts:
            continue

        # Majority vote: the most frequent answer group and its share of the samples.
        max_count = max(answer_counts.values())
        majority_answer = max(answer_counts.items(), key=lambda x: x[1])[0]
        score = max_count / len(results)

        # Skip proof-style questions ("证明" = "prove"), questions that mention boxing, and \text answers.
        if "证明" in question or "box" in question.lower() or "text" in majority_answer.lower():
            continue
        results_all.append({"question": question, "answer": majority_answer, "score": score, "results": results})
    except Exception as e:
        print("Error:", e)
        continue

print(len(results_all))

with open(f"{STORAGE_PATH}/generated_question/{args.save_name}_{args.suffix}_results.json", "w") as f:
    json.dump(results_all, f, indent=4)
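For reference, a single shard of this evaluation can be run on one GPU as sketched below; the save name and suffix are placeholders, and STORAGE_PATH must point at a directory whose generated_question/ subfolder contains <save_name>_<suffix>.json (the script deletes that input file after loading it):

    export STORAGE_PATH=/path/to/storage
    CUDA_VISIBLE_DEVICES=0 python evaluate.py \
        --model Qwen/Qwen3-4B-Base \
        --num_samples 10 \
        --suffix 0 \
        --save_name my_experiment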
    	
        evaluate.sh
    ADDED
    
#!/bin/bash
# Launch one evaluation shard per GPU, then enforce a timeout on the stragglers.

model_name=$1
save_name=$2

pids=()

# One evaluate.py process per GPU; each shard uses its GPU index as --suffix.
for i in {0..7}; do
  CUDA_VISIBLE_DEVICES=$i python question_evaluate/evaluate.py --model "$model_name" --suffix $i --save_name "$save_name" &
  pids[$i]=$!
done

# Wait for the first shard to finish before starting the timeout clock.
wait ${pids[0]}
echo "Task 0 finished."

timeout_duration=3600

# Watchdog: after an hour, force-kill any shard that is still running.
(
  sleep $timeout_duration
  echo "Timeout reached. Killing remaining tasks..."
  for i in {1..7}; do
    if kill -0 ${pids[$i]} 2>/dev/null; then
      kill -9 ${pids[$i]} 2>/dev/null
      echo "Killed task $i"
    fi
  done
) &

# Wait for the remaining shards (or for the watchdog to kill them).
for i in {1..7}; do
  wait ${pids[$i]} 2>/dev/null
done
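Design note: only shard 0 is waited on unconditionally; once it finishes, the remaining shards get a one-hour grace period (timeout_duration=3600) before the background watchdog force-kills them, so a single hung vLLM worker cannot stall the sweep indefinitely. The script takes the model name and save name as its two positional arguments, i.e. bash evaluate.sh <model> <save_name>, as in the end-to-end example above.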
    	
        upload.py
    ADDED
    
import argparse
import json
import os

import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from huggingface_hub import login

STORAGE_PATH = os.getenv("STORAGE_PATH")
HUGGINGFACENAME = os.getenv("HUGGINGFACENAME")
print(STORAGE_PATH)

# Authenticate with the Hugging Face Hub using the token stored in tokens.json.
with open("tokens.json", "r") as f:
    token = json.load(f)["huggingface"]
login(token=token)

parser = argparse.ArgumentParser()
parser.add_argument("--repo_name", type=str, default="")
parser.add_argument("--max_score", type=float, default=0.7)
parser.add_argument("--min_score", type=float, default=0.3)
parser.add_argument("--experiment_name", type=str, default="Qwen_Qwen3-4B-Base_all")
args = parser.parse_args()

# Merge the per-GPU result shards written by evaluate.py.
datas = []
for i in range(8):
    try:
        with open(f"{STORAGE_PATH}/generated_question/{args.experiment_name}_{i}_results.json", "r") as f:
            data = json.load(f)
            datas.extend(data)
    except Exception:
        print(f"File {args.experiment_name}_{i}_results.json not found")
        continue

# Clean up the shard files once they have been merged.
for i in range(8):
    try:
        os.remove(f"{STORAGE_PATH}/generated_question/{args.experiment_name}_{i}_results.json")
    except Exception:
        print(f"File {args.experiment_name}_{i}_results.json not found")
        continue

# Plot the distribution of majority-vote scores.
scores = [data["score"] for data in datas]
plt.hist(scores, bins=11)
plt.savefig("scores_distribution.png")

# Keep questions whose score falls within [min_score, max_score] and that have a non-empty answer,
# then push them to the Hub as a private dataset.
if args.repo_name != "":
    filtered_datas = [
        {"problem": data["question"], "answer": data["answer"], "score": data["score"]}
        for data in datas
        if args.min_score <= data["score"] <= args.max_score and data["answer"] != "" and data["answer"] != "None"
    ]
    print(len(filtered_datas))
    train_dataset = Dataset.from_list(filtered_datas)
    dataset_dict = {"train": train_dataset}
    config_name = f"{args.experiment_name}"
    dataset = DatasetDict(dataset_dict)
    dataset.push_to_hub(f"{HUGGINGFACENAME}/{args.repo_name}", private=True, config_name=config_name)
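upload.py expects a tokens.json file in the working directory with a "huggingface" key holding a write-capable Hub token, plus the HUGGINGFACENAME environment variable naming the namespace to push to. A minimal sketch, with the token value and names as placeholders:

    export HUGGINGFACENAME=your-hf-username
    cat > tokens.json <<'EOF'
    {
        "huggingface": "hf_xxx"
    }
    EOF
    python upload.py --experiment_name my_experiment --repo_name my_filtered_questions --min_score 0.3 --max_score 0.7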