import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time
import spaces
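
# Core settings: the model repository on the Hugging Face Hub, the per-reply
# generation cap, and the CPU thread budget.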
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4
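
# CPU_THREAD_COUNT is not referenced anywhere below; applying it with
# torch.set_num_threads is an assumption about its intended use (capping CPU
# threads for CPU-only runs).
torch.set_num_threads(CPU_THREAD_COUNT)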

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("Warning: the HF_TOKEN environment variable is not set. Private models may be inaccessible.")

print("--- Environment ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
print(f"Torch threads: {torch.get_num_threads()}")
print(f"HF_TOKEN set: {'yes' if HF_TOKEN else 'no'}")
print(f"--- Loading model: {MODEL_ID} ---")
print("The first run may take a few minutes...")

model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []

try:
    start_load_time = time.time()

    # Use the GPU with fp16 when available, otherwise fall back to CPU/fp32.
    device_map = "auto" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    tokenizer_kwargs = {
        "trust_remote_code": True
    }
    if HF_TOKEN:
        tokenizer_kwargs["token"] = HF_TOKEN

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        **tokenizer_kwargs
    )

    model_kwargs = {
        "torch_dtype": dtype,
        "device_map": device_map,
        "trust_remote_code": True
    }
    if HF_TOKEN:
        model_kwargs["token"] = HF_TOKEN

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )

    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and tokenizer loaded in {load_time:.2f} seconds ---")
    load_successful = True

    # Collect the token IDs that should terminate generation.
    stop_token_strings = ["</s>", "<|endoftext|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None; it cannot be added to the stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]

    if not stop_token_ids_list:
        print("Warning: no stop token IDs were found. Falling back to the default EOS token if available; otherwise generation may not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: no stop tokens could be found, not even a default EOS. Generation may run indefinitely.")

    print(f"Stop token IDs in use: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Model loading error: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    raise gr.Error(f"Failed to load model {MODEL_ID}. The application cannot start. Error: {e}")
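

# The system prompt is rebuilt on every call so that the date it mentions
# stays current.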
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- Today is {current_date}.\n"
        f"- Answer the user's questions kindly and in detail, in Korean."
    )
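

# Run one short generation at startup so the first real request does not pay
# the one-time initialization cost (CUDA context, kernel selection, caches).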
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: the model was not loaded successfully.")
        return

    print("--- Starting model warmup ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "Hello"

        system_prompt = get_system_prompt()

        # Use the same Human/Assistant prompt format as predict(), prefixed
        # with the system prompt.
        prompt = f"{system_prompt}\nHuman: {warmup_message}\nAssistant:"

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # A tiny deterministic generation is enough to trigger initialization.
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }

        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup warning: no stop tokens are defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model warmup finished in {warmup_time:.2f} seconds ---")

    except Exception as e:
        print(f"!!! Error during model warmup: {e}")
    finally:
        gc.collect()
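

# On ZeroGPU Spaces, the spaces.GPU() decorator requests a GPU for the
# duration of each call; on other hardware it is effectively a no-op.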
@spaces.GPU()
def predict(message, history):
    """
    Generate a response with the HyperCLOVAX-SEED-Vision-Instruct-3B model.

    `history` is the Gradio chat history; both the tuple format and the
    "messages" format (a list of dicts with "role"/"content" keys) are handled.
    """
    if model is None or tokenizer is None:
        return "Error: the model is not loaded."

    # Rebuild the running conversation as plain text.
    history_text = ""
    if isinstance(history, list):
        for turn in history:
            if isinstance(turn, (tuple, list)) and len(turn) == 2:
                history_text += f"Human: {turn[0]}\nAssistant: {turn[1]}\n"
            elif isinstance(turn, dict) and "role" in turn and "content" in turn:
                role = "Human" if turn["role"] == "user" else "Assistant"
                history_text += f"{role}: {turn['content']}\n"

    prompt = f"{history_text}Human: {message}\nAssistant:"

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        input_length = inputs.input_ids.shape[1]
        print(f"\nInput token count: {input_length}")
    except Exception as e:
        print(f"!!! Error while processing the input: {e}")
        return f"Error: a problem occurred while processing the input format. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "repetition_penalty": 1.1
        }

        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation warning: no stop tokens are defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation finished in {generation_time:.2f} seconds.")

    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"Error: a problem occurred while generating the response. ({e})"

    # Decode only the newly generated tokens (everything after the prompt).
    response = "Error: failed to generate a response."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output token count: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error while decoding the response: {e}")
            response = "Error: a problem occurred while decoding the response."

    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleanup complete.")

    return response.strip()
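

# Gradio UI: gr.ChatInterface wires predict() into a ready-made chat front end
# with clickable example prompts.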
print("--- Setting up the Gradio interface ---")

examples = [
    ["Hello! Please introduce yourself."],
    ["What is the difference between artificial intelligence and machine learning?"],
    ["Explain the steps of training a deep learning model."],
    ["I'm planning a trip to Jeju Island. Can you suggest a 3-night, 4-day itinerary?"],
]

title = "HyperCLOVAX-SEED-Vision-Instruct-3B"

demo = gr.ChatInterface(
    fn=predict,
    title=title,
    description=(
        f"**Model:** {MODEL_ID}\n"
    ),
    examples=examples,
    cache_examples=False,
    theme=gr.themes.Soft(),
)


if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warmup because model loading failed.")

    print("--- Launching the Gradio app ---")
    demo.queue().launch()