kimhyunwoo committed
Commit 3b77cfa · verified · 1 Parent(s): afe67d3

Create app.py

Files changed (1)
  1. app.py +167 -0
app.py ADDED
@@ -0,0 +1,167 @@
+ import gradio as gr
+ import torch
+ import os
+ from transformers import AutoTokenizer
+ from optimum.onnxruntime import ORTModelForCausalLM
+
+ # --- Configuration ---
+ # ONNX model ID specified by the user
+ MODEL_ID = "onnx-community/gemma-3-1b-it-ONNX-GQA"
+ # Quantized model file name (check the repository layout; if absent, fall back to the regular model)
+ # The Q4 model file may be at 'onnx/model_q4.onnx' -> let optimum try to auto-detect it
+ # First, try loading without specifying a file explicitly
+ ONNX_FILE_NAME = None  # e.g., "onnx/model_q4.onnx" if needed and present
+
+ # Hugging Face Hub token (if needed - the Gemma model may be gated, but the ONNX community version may not be)
+ # HF_TOKEN = os.getenv("HF_TOKEN")  # set in the Space secrets
+
+ # --- Device Selection ---
+ try:
+     if torch.cuda.is_available():
+         device = "cuda:0"
+         provider = "CUDAExecutionProvider"
+         print("Using GPU (CUDA).")
+     # MPS (Apple Silicon) - most likely unavailable on Gradio Spaces
+     # elif torch.backends.mps.is_available():
+     #     device = "mps"
+     #     provider = "CoreMLExecutionProvider"  # needs verification
+     #     print("Using MPS (Apple Silicon).")
+     else:
+         device = "cpu"
+         provider = "CPUExecutionProvider"
+         print("Using CPU.")
+ except Exception as e:
+     print(f"Device detection error: {e}. Defaulting to CPU.")
+     device = "cpu"
+     provider = "CPUExecutionProvider"
+
+ # --- Model and Tokenizer Loading ---
+ print(f"Attempting to load model: {MODEL_ID}")
+ print(f"Using device: {device}, Execution Provider: {provider}")
+
+ try:
+     # Load the tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)  # , token=HF_TOKEN)
+     print("Tokenizer loaded successfully.")
+
+     # Load the ONNX model via Optimum
+     # provider_options can be set here for further optimization if needed
+     model = ORTModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         # file_name=ONNX_FILE_NAME,  # an explicit file name may not be needed (auto-detected)
+         provider=provider,
+         # use_auth_token=HF_TOKEN,  # required for gated models
+         use_cache=True,  # use the KV cache
+         # provider_options={'enable_skip_layer_norm_strict_mode': True}  # example option
+     )
+     # Move the model to the chosen device (ORTModel may handle this internally, but it can be made explicit)
+     # model.to(device)  # ORTModel may not expose .to(); placement is handled via the provider setting
+     print(f"ONNX Model '{MODEL_ID}' loaded successfully with provider '{provider}'.")
+     model_loaded_successfully = True
+
+ except Exception as e:
+     print(f"!!!!!!!!!!!!!! Error loading model {MODEL_ID} !!!!!!!!!!!!!!")
+     print(f"Error type: {type(e).__name__}")
+     print(f"Error message: {e}")
+     print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+     model_loaded_successfully = False
+     # If the model fails to load, either stop the Gradio app or surface an error message
+     # raise gr.Error(f"CRITICAL: Failed to load model '{MODEL_ID}'. Check logs. Error: {e}")
+
+ # --- Chat Function ---
+ def chat_function(message: str, history: list):
+     if not model_loaded_successfully:
+         return "Error: The AI model failed to load. Cannot generate response."
+
+     # Convert the history and the new message into a prompt in Gemma Instruct format.
+     # Using the chat_template defined on the AutoTokenizer is recommended when available.
+     try:
+         # [[user_msg1, model_msg1], ...] -> [{"role": "user", "content": ...}, ...]
+         chat_messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
+         for user_msg, model_msg in history:
+             chat_messages.append({"role": "user", "content": user_msg})
+             # standard chat templates expect "assistant"; Gemma's template renders it as a "model" turn
+             chat_messages.append({"role": "assistant", "content": model_msg})
+         chat_messages.append({"role": "user", "content": message})
+
+         # Use the tokenizer's apply_chat_template (verify that the Gemma template supports these roles)
+         try:
+             prompt = tokenizer.apply_chat_template(
+                 chat_messages,
+                 tokenize=False,
+                 add_generation_prompt=True  # append the prompt that starts the model's turn
+             )
+         except Exception as template_error:
+             # If the template cannot be applied, build the prompt manually (same approach as the earlier JS version)
+             print(f"Warning: Failed to apply chat template ({template_error}). Falling back to manual prompt construction.")
+             prompt_parts = ["<start_of_turn>system\nYou are a helpful AI assistant.<end_of_turn>"]
+             for user_msg, model_msg in history:
+                 prompt_parts.append(f"<start_of_turn>user\n{user_msg}<end_of_turn>")
+                 prompt_parts.append(f"<start_of_turn>model\n{model_msg}<end_of_turn>")
+             prompt_parts.append(f"<start_of_turn>user\n{message}<end_of_turn>")
+             prompt_parts.append("<start_of_turn>model")
+             prompt = "\n".join(prompt_parts)
+
+         # print("\n--- Prompt ---")
+         # print(prompt)
+         # print("--------------\n")
+
+         # Tokenize the input
+         inputs = tokenizer(prompt, return_tensors="pt").to(device)  # move to the same device as the model
+
+         # Generate the response
+         print("Generating response...")
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=512,
+             do_sample=True,
+             temperature=0.7,
+             top_k=50,
+             top_p=0.9,
+             # pad_token_id=tokenizer.eos_token_id  # set if padding is needed
+         )
+         print("Generation complete.")
+
+         # Decode only the newly generated tokens (exclude the input part)
+         # (note: use inputs['input_ids'][0] rather than inputs[0] for the input length)
+         input_token_len = inputs['input_ids'].shape[1]
+         generated_tokens = outputs[0][input_token_len:]
+         response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         # Strip the end-of-turn token and any unnecessary trailing text
+         response = response.replace("<end_of_turn>", "").strip()
+
+         # print("\n--- Response ---")
+         # print(response)
+         # print("--------------\n")
+
+         # history.append((message, response))  # history is managed by Gradio
+         return response
+
+     except Exception as e:
+         print(f"Error during generation: {e}")
+         # Return a safe error message that can be shown to the user
+         return "Sorry, an error occurred during response generation. Please check the application logs for details."
+
+
+ # --- Gradio Interface ---
+ print("Creating Gradio Interface...")
+ iface = gr.ChatInterface(
+     fn=chat_function if model_loaded_successfully else lambda msg, hist: "Model not loaded.",  # fallback function when the model failed to load
+     title="AI Assistant (Gemma 3 1B ONNX)",
+     description=f"Chat with {MODEL_ID}. Model loaded: {model_loaded_successfully}",
+     chatbot=gr.Chatbot(height=600),
+     textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
+     submit_btn="Send",
+     retry_btn="Retry",  # note: retry_btn/undo_btn/clear_btn are Gradio 4.x arguments (removed in Gradio 5)
+     undo_btn="Undo",
+     clear_btn="Clear",
+     theme=gr.themes.Soft(),  # apply a theme
+     examples=[["Hello!"], ["Write a poem about the internet."]]
+ )
+
+ # --- Launch App ---
+ if __name__ == "__main__":
+     print("Launching Gradio App...")
+     # share=True creates a link that is accessible from outside (use with caution)
+     iface.launch()  # share=True
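
For reference, a minimal smoke-test sketch (assuming the same dependencies as app.py — transformers and optimum[onnxruntime] — and that the ONNX export loads through ORTModelForCausalLM) that exercises the same load-and-generate path outside Gradio, so the model can be verified before the Space UI is involved:

    # Hypothetical standalone check, not part of the committed app.py.
    from transformers import AutoTokenizer
    from optimum.onnxruntime import ORTModelForCausalLM

    MODEL_ID = "onnx-community/gemma-3-1b-it-ONNX-GQA"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = ORTModelForCausalLM.from_pretrained(MODEL_ID, provider="CPUExecutionProvider")

    # Single user turn formatted with the tokenizer's chat template
    messages = [{"role": "user", "content": "Hello!"}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=32, do_sample=False)

    # Decode only the tokens generated after the prompt
    print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))

If this prints a sensible reply on CPU, the same MODEL_ID and provider settings should behave identically inside chat_function above.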