import gradio as gr
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import os
from datasets import load_dataset
import accelerate
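
# accelerate is presumably imported so that device_map="auto" can be used when
# loading the model below; it is not referenced directly in this script.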

# Read the Hugging Face access token from the environment
hf_token = os.environ.get("HF_TOKEN", None)

# Log in to Hugging Face
if hf_token:
    login(token=hf_token, add_to_git_credential=True)
else:
    print("HF_TOKEN environment variable is not set")

# Model and tokenizer setup
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=False,
    token=hf_token,
)
# Optional: device_map="auto"

# Load the KMMLU dataset
# dataset = load_dataset("HAERAE-HUB/KMMLU", "Accounting")
dataset = load_dataset("HAERAE-HUB/KMMLU")
df = dataset['test'].to_pandas()
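
# Note: KMMLU on the Hub is organized into per-subject configs (e.g. "Accounting",
# as in the commented line above); loading it without a config name may fail on
# some datasets versions, in which case load the per-subject form instead.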

def evaluate_model(question, choices):
    # Build a lettered multiple-choice prompt (A, B, C, D)
    prompt = f"Question: {question}\n\nChoices:\n"
    for i, choice in enumerate(choices):
        prompt += f"{chr(65 + i)}. {choice}\n"
    prompt += "\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Greedy decoding of a single token, which should be the answer letter
        outputs = model.generate(**inputs, max_new_tokens=1, do_sample=False)
    answer = tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip()
    return answer
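
# Quick sanity check (hypothetical example, not taken from the dataset):
#   evaluate_model("What is 1 + 1?", ["1", "2", "3", "4"])  # expected: "B"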

def run_kmmlu_test(subject):
    subject_df = df[df['subject'] == subject]
    correct = 0
    total = len(subject_df)
    results = []
    for _, row in subject_df.iterrows():
        question = row['question']
        choices = [row['A'], row['B'], row['C'], row['D']]
        # KMMLU stores the gold answer as an integer (1-4); map it to a letter
        # (A-D) so it can be compared with the model's letter output.
        correct_answer = chr(64 + int(row['answer']))
        model_answer = evaluate_model(question, choices)
        is_correct = model_answer == correct_answer
        if is_correct:
            correct += 1
        results.append(
            f"Question: {question}\nModel answer: {model_answer}\n"
            f"Correct answer: {correct_answer}\nResult: {'correct' if is_correct else 'wrong'}\n"
        )
    accuracy = correct / total if total else 0.0
    summary = f"Subject: {subject}\nAccuracy: {accuracy:.2%} ({correct}/{total})\n\n"
    return summary + "\n".join(results)
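
# Note: each question requires a separate generate() call, so evaluating a full
# subject (hundreds of questions) can take several minutes even on a GPU.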

subjects = df['subject'].unique().tolist()

iface = gr.Interface(
    fn=run_kmmlu_test,
    inputs=gr.Dropdown(choices=subjects, label="Select a subject"),
    outputs="text",
    title="KMMLU Test with Llama 3.1",
    description="Runs the KMMLU test on the selected subject.",
)

iface.launch()