import gradio as gr
import pandas as pd  # backs the DataFrame returned by .to_pandas() below
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import os
from datasets import load_dataset
import accelerate  # needed by transformers for device_map-style loading

# Read the Hugging Face access token from the environment
hf_token = os.environ.get("HF_TOKEN")

# Log in to Hugging Face (the Meta-Llama repos are gated, so a token is required)
if hf_token:
    login(token=hf_token, add_to_git_credential=True)
else:
    print("Error: the HF_TOKEN environment variable is not set")

# Set up the model and tokenizer
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=False,
    token=hf_token,
)
# Removed option: device_map="auto". Note that without it the model loads
# onto CPU, where float16 generation is very slow.

# Load the KMMLU dataset (Accounting subject only)
dataset = load_dataset("HAERAE-HUB/KMMLU", "Accounting")
#dataset = load_dataset("HAERAE-HUB/KMMLU")
df = dataset['test'].to_pandas()
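# Per the HAERAE-HUB/KMMLU dataset card, each test row carries 'question',
# the four choices 'A'-'D', and 'answer', a 1-4 integer index (not a letter).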

def evaluate_model(question, choices):
    # Build a Korean multiple-choice prompt (KMMLU questions are in Korean);
    # the labels mean "Question", "Choices", and "Answer".
    prompt = f"์งˆ๋ฌธ: {question}\n\n์„ ํƒ์ง€:\n"
    for i, choice in enumerate(choices):
        prompt += f"{chr(65 + i)}. {choice}\n"
    prompt += "\n๋‹ต๋ณ€:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy-decode a single token; temperature has no effect unless
        # do_sample=True, so decode deterministically instead.
        outputs = model.generate(
            **inputs,
            max_new_tokens=1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated token (the model's answer letter)
    answer = tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip()
    return answer
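
# Illustrative rendering of the prompt above (placeholder text, not real data):
# ์งˆ๋ฌธ: <question>
# ์„ ํƒ์ง€:
# A. <choice 1>
# ...
# D. <choice 4>
# ๋‹ต๋ณ€: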

def run_kmmlu_test():
    correct = 0
    total = len(df)

    results = []
    for _, row in df.iterrows():
        question = row['question']
        choices = [row['A'], row['B'], row['C'], row['D']]
        # 'answer' is a 1-4 index; map it to the matching letter so it can be
        # compared against the model's single-letter output.
        correct_answer = chr(64 + int(row['answer']))

        model_answer = evaluate_model(question, choices)
        is_correct = model_answer == correct_answer

        if is_correct:
            correct += 1

        results.append(
            f"Question: {question}\nModel answer: {model_answer}\n"
            f"Correct answer: {correct_answer}\nResult: {'correct' if is_correct else 'wrong'}\n"
        )

    accuracy = correct / total
    summary = f"Overall test results\nAccuracy: {accuracy:.2%} ({correct}/{total})\n\n"
    return summary + "\n".join(results)
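
# Note: run_kmmlu_test() performs one generate() call per question across the
# whole test split, so a full run can take a long time (especially on CPU).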


iface = gr.Interface(
    fn=run_kmmlu_test,
    inputs=None,
    #inputs=gr.Dropdown(choices=subjects, label="Select subject"),
    outputs="text",
    title="KMMLU test with Llama 3.1",
    description="Runs the KMMLU test on the Accounting subject"
)

iface.launch(share=True)
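
# share=True additionally creates a temporary public Gradio link; drop it to
# serve the app locally only.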