File size: 3,953 Bytes
f498762
9d9cc80
 
 
f498762
9d9cc80
 
 
f498762
9d9cc80
 
f498762
9d9cc80
 
ece9655
9d9cc80
 
 
 
ece9655
9d9cc80
 
 
 
 
ece9655
9d9cc80
 
ece9655
9d9cc80
 
 
 
ece9655
9d9cc80
f498762
9d9cc80
 
 
 
 
f498762
 
9d9cc80
 
 
 
f498762
9d9cc80
 
 
 
 
 
f498762
9d9cc80
 
 
 
 
 
 
 
 
 
f498762
9d9cc80
 
 
f498762
9d9cc80
f498762
9d9cc80
 
f498762
9d9cc80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ece9655
 
9d9cc80
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# --- ๋ชจ๋ธ ๋กœ๋“œ ---
# ๋ชจ๋ธ ๊ฒฝ๋กœ ์„ค์ • (Hugging Face ๋ชจ๋ธ ID)
model_id = "microsoft/bitnet-b1.58-2B-4T"

# ๋ชจ๋ธ ๋กœ๋“œ ์‹œ ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€๋ฅผ ์ตœ์†Œํ™”ํ•˜๊ธฐ ์œ„ํ•ด ๋กœ๊น… ๋ ˆ๋ฒจ ์„ค์ •
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# AutoModelForCausalLM๊ณผ AutoTokenizer๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
# BitNet ๋ชจ๋ธ์€ trust_remote_code=True๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.
# GitHub ํŠน์ • ๋ธŒ๋žœ์น˜์—์„œ ์„ค์น˜ํ•œ transformers๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
try:
    print(f"๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘: {model_id}...")
    # GPU๊ฐ€ ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•˜๋ฉด bf16 ์‚ฌ์šฉ
    if torch.cuda.is_available():
        # torch_dtype์„ ๋ช…์‹œ์ ์œผ๋กœ ์„ค์ •ํ•˜์—ฌ ๋กœ๋“œ ์˜ค๋ฅ˜ ๋ฐฉ์ง€ ์‹œ๋„
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        ).to("cuda") # GPU๋กœ ๋ชจ๋ธ ์ด๋™
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        print("GPU๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ.")
    else:
        # CPU ์‚ฌ์šฉ ์‹œ torch_dtype ์ƒ๋žต ๋˜๋Š” float32
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        print("CPU๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ. ์„ฑ๋Šฅ์ด ๋А๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")

except Exception as e:
    print(f"๋ชจ๋ธ ๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
    tokenizer = None
    model = None
    print("๋ชจ๋ธ ๋กœ๋“œ์— ์‹คํŒจํ–ˆ์Šต๋‹ˆ๋‹ค. ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜์ด ์ œ๋Œ€๋กœ ๋™์ž‘ํ•˜์ง€ ์•Š์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")


# --- ํ…์ŠคํŠธ ์ƒ์„ฑ ํ•จ์ˆ˜ ---
def generate_text(prompt, max_length=100, temperature=0.7):
    """Generate a continuation for *prompt* using the globally loaded model.

    Args:
        prompt: Input text to continue.
        max_length: Maximum number of newly generated tokens.
        temperature: Sampling temperature (higher = more random output).

    Returns:
        The generated text with the prompt portion stripped, or an error
        message string when the model is unavailable or generation fails.
    """
    if model is None or tokenizer is None:
        return "๋ชจ๋ธ ๋กœ๋“œ์— ์‹คํŒจํ•˜์—ฌ ํ…์ŠคํŠธ ์ƒ์„ฑ์„ ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

    try:
        # Tokenize the prompt and move the tensors to the model's own
        # device. Keying off the model (rather than CUDA availability)
        # keeps placement correct regardless of where the model lives.
        inputs = tokenizer(prompt, return_tensors="pt")
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: disable autograd to save memory and time.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=temperature,
                do_sample=True,  # enable sampling
                pad_token_id=tokenizer.eos_token_id,  # explicit pad token id
            )

        # Decode only the newly generated tokens, skipping the prompt part.
        prompt_len = inputs["input_ids"].shape[-1]
        return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    except Exception as e:
        return f"ํ…์ŠคํŠธ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}"

# --- Gradio ์ธํ„ฐํŽ˜์ด์Šค ์„ค์ • ---
if model is not None and tokenizer is not None:
    # Assemble the demo UI: a prompt box and two sliders wired to
    # generate_text, with a single textbox for the result.
    prompt_box = gr.Textbox(
        lines=2, placeholder="ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”...", label="์ž…๋ ฅ ํ”„๋กฌํ”„ํŠธ"
    )
    length_slider = gr.Slider(minimum=10, maximum=500, value=100, label="์ตœ๋Œ€ ์ƒ์„ฑ ๊ธธ์ด")
    temperature_slider = gr.Slider(
        minimum=0.1, maximum=1.0, value=0.7, label="Temperature (์ฐฝ์˜์„ฑ)"
    )
    output_box = gr.Textbox(label="์ƒ์„ฑ๋œ ํ…์ŠคํŠธ")

    interface = gr.Interface(
        fn=generate_text,
        inputs=[prompt_box, length_slider, temperature_slider],
        outputs=output_box,
        title="BitNet b1.58-2B-4T ํ…์ŠคํŠธ ์ƒ์„ฑ ๋ฐ๋ชจ",
        description="BitNet b1.58-2B-4T ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•˜์—ฌ ํ…์ŠคํŠธ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.",
    )

    # Start the app (Hugging Face Spaces manages sharing automatically).
    interface.launch()
else:
    print("๋ชจ๋ธ ๋กœ๋“œ ์‹คํŒจ๋กœ ์ธํ•ด Gradio ์ธํ„ฐํŽ˜์ด์Šค๋ฅผ ์‹คํ–‰ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")