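"""Gradio app that evaluates Llama 3.1 on the KMMLU benchmark, one subject at a time."""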
import gradio as gr
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import os
from datasets import load_dataset

# ํ™˜๊ฒฝ ๋ณ€์ˆ˜์—์„œ ํ† ํฐ ๊ฐ€์ ธ์˜ค๊ธฐ
hf_token = os.environ.get("HF_TOKEN", None)

# Hugging Face ๋กœ๊ทธ์ธ
if hf_token:
    login(token=hf_token, add_to_git_credential=True)
else:
    print("HF_TOKEN ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ์„ค์ • ์˜ค๋ฅ˜")

# Set up the model and tokenizer
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, low_cpu_mem_usage=False, token=hf_token
)
# Removed option: device_map="auto"

# Load the KMMLU dataset
# (a single subset can also be loaded, e.g. load_dataset("HAERAE-HUB/KMMLU", "Accounting");
# if the Hub dataset requires a config name, the single-subset form is the one to use)
dataset = load_dataset("HAERAE-HUB/KMMLU")
df = dataset['test'].to_pandas()

def evaluate_model(question, choices):
    # Build a multiple-choice prompt. The prompt text is kept in Korean to match
    # the KMMLU benchmark ("질문" = question, "선택지" = choices, "답변" = answer).
    prompt = f"질문: {question}\n\n선택지:\n"
    for i, choice in enumerate(choices):
        prompt += f"{chr(65 + i)}. {choice}\n"
    prompt += "\n답변:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy decoding of a single token; do_sample=False replaces temperature=0.0,
        # which transformers ignores (with a warning) when sampling is disabled
        outputs = model.generate(**inputs, max_new_tokens=1, do_sample=False)

    # Decode only the newly generated token, i.e. the answer letter
    answer = tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip()
    return answer
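
# Note (assumption): some KMMLU releases store the gold answer as a numeric
# index (1-4) rather than a letter. If row['answer'] is numeric, map the model's
# letter before comparing, e.g.:
#
#     def letter_to_index(letter):
#         return ord(letter) - ord("A") + 1  # 'A' -> 1, ..., 'D' -> 4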

def run_kmmlu_test(subject):
    # Filter the test split down to the selected subject
    subject_df = df[df['subject'] == subject]
    correct = 0
    total = len(subject_df)
    if total == 0:
        return f"No questions found for subject: {subject}"

    results = []
    for _, row in subject_df.iterrows():
        question = row['question']
        choices = [row['A'], row['B'], row['C'], row['D']]
        correct_answer = row['answer']

        model_answer = evaluate_model(question, choices)
        is_correct = model_answer == correct_answer

        if is_correct:
            correct += 1

        results.append(
            f"Question: {question}\nModel answer: {model_answer}\n"
            f"Correct answer: {correct_answer}\nResult: {'correct' if is_correct else 'wrong'}\n"
        )

    accuracy = correct / total
    summary = f"Subject: {subject}\nAccuracy: {accuracy:.2%} ({correct}/{total})\n\n"
    return summary + "\n".join(results)

subjects = df['subject'].unique().tolist()

iface = gr.Interface(
    fn=run_kmmlu_test,
    inputs=gr.Dropdown(choices=subjects, label="Select a subject"),
    outputs="text",
    title="KMMLU Test with Llama 3",
    description="Runs the KMMLU test for the selected subject."
)

iface.launch()
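# iface.launch(share=True) would additionally create a temporary public URL,
# which can be useful when running outside Hugging Face Spaces.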