File size: 2,505 Bytes
cfbe56d 70267fe 2f24e2f 7ad5a82 70267fe 2f24e2f dcd6208 2f24e2f 70267fe 3946289 70267fe d94d325 2f24e2f a0333c0 2f24e2f 7ad5a82 d94d325 2f24e2f cfbe56d a0333c0 cfbe56d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import gradio as gr
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import os
from datasets import load_dataset
import accelerate
# Read the Hugging Face access token from the environment.
hf_token = os.environ.get("HF_TOKEN", None)

# Log in to Hugging Face when a token is available; otherwise only report
# that it is missing and continue (the loads below then run without a token).
if hf_token:
    login(token=hf_token, add_to_git_credential=True)
else:
    # Message restored to the intended Korean text (it had been mis-decoded):
    # "The HF_TOKEN environment variable is not set."
    print("HF_TOKEN 환경 변수가 설정되지 않았습니다.")

# Model and tokenizer setup.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# `device_map` requires `low_cpu_mem_usage=True`; transformers versions that
# validate this combination raise ValueError when it is explicitly False.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    token=hf_token,
)

# Load the "Accounting" configuration of KMMLU and keep its test split as a
# pandas DataFrame for the evaluation functions below.
dataset = load_dataset("HAERAE-HUB/KMMLU", "Accounting")
df = dataset['test'].to_pandas()
def evaluate_model(question, choices):
    """Ask the model one multiple-choice question and return its answer token.

    The prompt lists the question, then each choice labelled "A.", "B.", ...
    in order, and ends with an answer cue so the next generated token is the
    model's pick.

    Args:
        question: Question text.
        choices: Iterable of answer-option texts, in label order.

    Returns:
        The single newly generated token decoded to text with surrounding
        whitespace stripped. The value is not validated to be a choice letter.
    """
    option_lines = "".join(
        f"{chr(65 + i)}. {choice}\n" for i, choice in enumerate(choices)
    )
    # Korean labels restored from mis-decoded text:
    # "질문" = question, "선택지" = choices, "답변" = answer.
    prompt = f"질문: {question}\n\n선택지:\n{option_lines}\n답변:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        # Greedy decoding is requested with do_sample=False; temperature=0.0
        # is not a valid sampling temperature and is ignored or flagged when
        # sampling is off.
        outputs = model.generate(**inputs, max_new_tokens=1, do_sample=False)

    # Decode only what was generated after the prompt.
    generated = outputs[0][prompt_length:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
def _answer_letter(answer):
    """Convert a gold answer to an upper-case choice letter.

    A positive integer (or digit string) is treated as a 1-based choice index
    (1 -> "A", 2 -> "B", ...); anything else is stripped and upper-cased.
    """
    text = str(answer).strip()
    if text.isdigit() and int(text) >= 1:
        return chr(ord("A") + int(text) - 1)
    return text.upper()


def run_kmmlu_test(subject):
    """Evaluate the model on every question of one subject and build a report.

    Args:
        subject: Value to match in the subject column of the module-level
            ``df``.

    Returns:
        A text report: a summary (subject, accuracy, correct/total) followed
        by one entry per question with the model answer, the gold answer and
        whether they matched. When no rows match, only a summary with
        accuracy "N/A" is returned.
    """
    # NOTE(review): the published KMMLU schema appears to name this column
    # "Category" rather than "subject" - confirm against the dataset card.
    # "subject" is still preferred when present.
    subject_column = 'subject' if 'subject' in df.columns else 'Category'
    subject_df = df[df[subject_column] == subject]

    total = len(subject_df)
    if total == 0:
        # Nothing selected or no matching rows: avoid dividing by zero.
        return f"주제: {subject}\n정확도: N/A (0/0)\n\n"

    correct = 0
    results = []
    for _, row in subject_df.iterrows():
        question = row['question']
        choices = [row['A'], row['B'], row['C'], row['D']]
        # The gold answer may be a 1-based index while the model emits a
        # letter, so both are brought to the same form before comparing.
        correct_answer = _answer_letter(row['answer'])
        model_answer = evaluate_model(question, choices)
        is_correct = model_answer.strip().upper() == correct_answer
        if is_correct:
            correct += 1
        # Korean labels restored from mis-decoded text: question / model
        # answer / correct answer / accuracy ("맞음" = right, "틀림" = wrong).
        results.append(
            f"질문: {question}\n"
            f"모델 답변: {model_answer}\n"
            f"정답: {correct_answer}\n"
            f"정확도: {'맞음' if is_correct else '틀림'}\n"
        )

    accuracy = correct / total
    summary = f"주제: {subject}\n정확도: {accuracy:.2%} ({correct}/{total})\n\n"
    return summary + "\n".join(results)
# Subjects offered in the dropdown: the distinct values of the subject column.
# NOTE(review): the published KMMLU schema appears to name this column
# "Category" rather than "subject" - confirm against the dataset card.
_subject_column = 'subject' if 'subject' in df.columns else 'Category'
subjects = df[_subject_column].unique().tolist()

# Gradio UI: pick a subject, run the evaluation, show the text report.
# The Korean UI strings are restored from mis-decoded text, which had also
# split the title and description literals across lines:
#   label       - "Select subject"
#   title       - "KMMLU test using Llama 3"
#   description - "Runs the KMMLU test for the selected subject."
iface = gr.Interface(
    fn=run_kmmlu_test,
    inputs=gr.Dropdown(choices=subjects, label="주제 선택"),
    outputs="text",
    title="Llama 3를 이용한 KMMLU 테스트",
    description="선택한 주제에 대해 KMMLU 테스트를 실행합니다."
)
iface.launch()