import gradio as gr
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the model and tokenizer.
# Note: meta-llama/Meta-Llama-3-8B-Instruct is a gated repository, so the
# runtime needs a Hugging Face token for an account that has been granted access.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Load the KMMLU dataset.
# Option 1: read a local sample directly: df = pd.read_csv("kmmlu_sample.csv")
# Option 2: pull one subject config from the Hugging Face Hub and convert it to
# a pandas DataFrame so the DataFrame-based code below works unchanged.
from datasets import load_dataset

subject_name = "Accounting"
df = load_dataset("HAERAE-HUB/KMMLU", subject_name, split="test").to_pandas()
df["subject"] = subject_name  # the filtering and dropdown below expect a 'subject' column

# If the gold answer comes as a 1-4 index (as in the Hub copy of KMMLU),
# map it to a letter so it can be compared with the letter the model generates.
if pd.api.types.is_numeric_dtype(df["answer"]):
    df["answer"] = df["answer"].map({1: "A", 2: "B", 3: "C", 4: "D"})

def evaluate_model(question, choices):
    # Build a multiple-choice prompt and ask the model for a single answer letter.
    prompt = f"Question: {question}\n\nChoices:\n"
    for i, choice in enumerate(choices):
        prompt += f"{chr(65 + i)}. {choice}\n"
    prompt += "\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy decoding of one token; use do_sample=False rather than
        # temperature=0.0, which is ignored when sampling is off.
        outputs = model.generate(**inputs, max_new_tokens=1, do_sample=False)

    # outputs[0] holds the prompt plus the single generated token; decode only the last token.
    answer = tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip()
    return answer

def run_kmmlu_test(subject):
    subject_df = df[df['subject'] == subject]
    correct = 0
    total = len(subject_df)
    
    results = []
    for _, row in subject_df.iterrows():
        question = row['question']
        choices = [row['A'], row['B'], row['C'], row['D']]
        correct_answer = row['answer']
        
        model_answer = evaluate_model(question, choices)
        is_correct = model_answer == correct_answer
        
        if is_correct:
            correct += 1
        
        results.append(f"์งˆ๋ฌธ: {question}\n๋ชจ๋ธ ๋‹ต๋ณ€: {model_answer}\n์ •๋‹ต: {correct_answer}\n์ •ํ™•๋„: {'๋งž์Œ' if is_correct else 'ํ‹€๋ฆผ'}\n")
    
    accuracy = correct / total
    summary = f"์ฃผ์ œ: {subject}\n์ •ํ™•๋„: {accuracy:.2%} ({correct}/{total})\n\n"
    return summary + "\n".join(results)

subjects = df['subject'].unique().tolist()

iface = gr.Interface(
    fn=run_kmmlu_test,
    inputs=gr.Dropdown(choices=subjects, label="Select subject"),
    outputs="text",
    title="KMMLU Test with Llama 3",
    description="Runs the KMMLU test for the selected subject."
)
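
# --- Optional extension (sketch, not part of the original app) ---
# With the single-config load above, the dropdown only offers "Accounting".
# The helper below sketches loading every KMMLU subject config into one
# DataFrame; config names are discovered at runtime via get_dataset_config_names,
# so none are hard-coded here. Downloading all configs makes startup slower.
from datasets import get_dataset_config_names

def load_all_subjects(repo_id="HAERAE-HUB/KMMLU", split="test"):
    frames = []
    for config in get_dataset_config_names(repo_id):
        subset = load_dataset(repo_id, config, split=split).to_pandas()
        subset["subject"] = config  # tag each row with its subject config name
        frames.append(subset)
    return pd.concat(frames, ignore_index=True)

# Usage: replace the single-subject load above with `df = load_all_subjects()`
# (and reapply the answer-index mapping) before `subjects` is computed.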

iface.launch()