openfree committed · verified
Commit ccc2ed2 · 1 Parent(s): 5969407

Update app.py

Files changed (1): app.py +108 −50
app.py CHANGED
@@ -3,7 +3,8 @@ import threading
 import gc
 import os
 import torch
-
+import time
+import signal
 import gradio as gr
 import spaces
 import transformers
@@ -13,24 +14,26 @@ from huggingface_hub import login
 # Settings for model memory management and optimization
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-MAX_GPU_MEMORY = 80 * 1024 * 1024 * 1024  # based on an 80GB A100 (actual usable memory is lower)
+MAX_GPU_MEMORY = 80 * 1024 * 1024 * 1024  # based on an 80GB A100
 
-# List of available models - filtered to those that run efficiently on an A100
+# List of available models - reordered to start from smaller models
 available_models = {
+    "google/gemma-2b": "Google Gemma (2B)",  # smaller model as the new default
+    "mistralai/Mistral-7B-Instruct-v0.2": "Mistral 7B Instruct v0.2",
     "mistralai/Mistral-Small-3.1-24B-Base-2503": "Mistral Small 3.1 (24B)",
-    "bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF": "Mistral Small 3.1 GGUF (24B)",
     "google/gemma-3-27b-it": "Google Gemma 3 (27B)",
     "Qwen/Qwen2.5-Coder-32B-Instruct": "Qwen 2.5 Coder (32B)",
     "open-r1/OlympicCoder-32B": "Olympic Coder (32B)"
 }
 
-# Default model - the first entry in available_models
+# Default model - now the smallest model
 DEFAULT_MODEL_KEY = list(available_models.keys())[0]
 DEFAULT_MODEL_VALUE = available_models[DEFAULT_MODEL_KEY]
 
 # Global variables used for model loading
 pipe = None
 current_model_name = None
+loading_in_progress = False
 
 # Try to log in with the Hugging Face token
 try:
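Since `DEFAULT_MODEL_KEY` is simply the first dictionary key, reordering `available_models` is what actually flips the default; Python dicts preserve insertion order (3.7+), so the choice is deterministic. A quick sanity check against the dictionary above:

```python
# With the reordered dictionary, the 2B model is now both the first key
# and therefore the auto-loaded default.
assert list(available_models.keys())[0] == "google/gemma-2b"
assert DEFAULT_MODEL_VALUE == "Google Gemma (2B)"
```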
@@ -70,33 +73,33 @@ latex_delimiters = [
 # Model-size-based configuration - defines optimal settings per size class
 MODEL_CONFIG = {
     "small": {  # <10B
-        "max_memory": {0: "20GiB"},
+        "max_memory": {0: "10GiB"},
         "offload": False,
         "quantization": None
     },
     "medium": {  # 10B-30B
-        "max_memory": {0: "40GiB"},
+        "max_memory": {0: "30GiB"},
         "offload": False,
-        "quantization": None  # quantization disabled due to BitsAndBytes issues
+        "quantization": None
     },
     "large": {  # >30B
-        "max_memory": {0: "70GiB"},
+        "max_memory": {0: "60GiB"},
         "offload": True,
-        "quantization": None  # quantization disabled due to BitsAndBytes issues
+        "quantization": None
     }
 }
 
 def get_model_size_category(model_name):
     """Determine the model's size category"""
-    if "3B" in model_name or "8B" in model_name:
+    if "2B" in model_name or "3B" in model_name or "7B" in model_name or "8B" in model_name:
         return "small"
-    elif "24B" in model_name or "27B" in model_name:
+    elif "15B" in model_name or "24B" in model_name or "27B" in model_name:
         return "medium"
     elif "32B" in model_name or "70B" in model_name:
         return "large"
     else:
-        # return medium by default
-        return "medium"
+        # return small by default (to be safe)
+        return "small"
 
 def clear_gpu_memory():
     """Clear GPU memory"""
@@ -138,26 +141,36 @@ def rebuild_messages(history: list):
         messages.append({"role": h.role, "content": h.content})
     return messages
 
-def load_model(model_names):
+def load_model(model_names, status_callback=None):
     """Load the selected model (with settings optimized for an A100)"""
-    global pipe, current_model_name
-
-    # Clean up the existing model
-    clear_gpu_memory()
+    global pipe, current_model_name, loading_in_progress
 
-    # Fall back to the default when no model is selected
-    if not model_names:
-        model_name = DEFAULT_MODEL_KEY  # use the first available model as the default
-    else:
-        # Use the first selected model
-        model_name = model_names[0]
+    # Bail out if another load is already in progress
+    if loading_in_progress:
+        return "Another model is already loading. Please wait a moment."
 
-    # Check the model size category
-    size_category = get_model_size_category(model_name)
-    config = MODEL_CONFIG[size_category]
+    loading_in_progress = True
 
-    # Load the model (applying size-optimized settings)
     try:
+        # Clean up the existing model
+        clear_gpu_memory()
+
+        # Fall back to the default when no model is selected
+        if not model_names:
+            model_name = DEFAULT_MODEL_KEY
+        else:
+            # Use the first selected model
+            model_name = model_names[0]
+
+        # Check the model size category
+        size_category = get_model_size_category(model_name)
+        config = MODEL_CONFIG[size_category]
+
+        # Report loading progress
+        if status_callback:
+            status_callback(f"Loading model '{model_name}'... (size: {size_category})")
+
+        # Load the model (applying size-optimized settings)
         # Check the HF_TOKEN environment variable
         hf_token = os.getenv("HF_TOKEN")
         # Common parameters
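The `loading_in_progress` flag prevents most double loads, but a plain boolean is not atomic: two requests can both observe `False` before either sets it. A lock-based variant would close that window; this is a sketch reusing the `load_model` above, not the committed code:

```python
import threading

_load_lock = threading.Lock()

def load_model_exclusive(model_names, status_callback=None):
    """Non-blocking, mutually exclusive wrapper around load_model (sketch)."""
    if not _load_lock.acquire(blocking=False):
        return "Another model is already loading. Please wait a moment."
    try:
        return load_model(model_names, status_callback)
    finally:
        _load_lock.release()
```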
@@ -166,14 +179,25 @@ def load_model(model_names):
             "trust_remote_code": True,
         }
 
-    # Check whether BitsAndBytes can be used
+        # Check whether BitsAndBytes is available
         try:
             import bitsandbytes
             has_bitsandbytes = True
-        print("BitsAndBytes library loaded successfully")
         except ImportError:
             has_bitsandbytes = False
-        print("BitsAndBytes library not found. Loading the model without quantization.")
+            if status_callback:
+                status_callback("BitsAndBytes library not found. Loading without quantization.")
+
+        # Set a load time limit (varies with model size)
+        if size_category == "small":
+            load_timeout = 180  # 3 minutes
+        elif size_category == "medium":
+            load_timeout = 300  # 5 minutes
+        else:
+            load_timeout = 600  # 10 minutes
+
+        # Record when loading started
+        start_time = time.time()
 
         # If quantization is configured and BitsAndBytes is available
         if config["quantization"] and has_bitsandbytes:
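Note that every tier in `MODEL_CONFIG` still sets `"quantization": None`, so the `config["quantization"] and has_bitsandbytes` branch can never run in this commit; the BitsAndBytes probe only affects the status message. If 4-bit loading were re-enabled, the tier entry would carry a real config. A sketch follows; whether the field is meant to hold a flag or a full `BitsAndBytesConfig` object is not visible in the diff:

```python
import torch
from transformers import BitsAndBytesConfig

# Hypothetical re-enabling of 4-bit quantization for the "large" tier only.
MODEL_CONFIG["large"]["quantization"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # matches DTYPE on CUDA
)
```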
@@ -184,6 +208,9 @@ def load_model(model_names):
                 bnb_4bit_compute_dtype=DTYPE
             )
 
+            if status_callback:
+                status_callback(f"Loading model '{model_name}'... (applying quantization)")
+
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 device_map="auto",
@@ -204,6 +231,9 @@ def load_model(model_names):
             )
         else:
             # Load without quantization
+            if status_callback:
+                status_callback(f"Loading model '{model_name}'... (standard mode)")
+
             pipe = pipeline(
                 "text-generation",
                 model=model_name,
@@ -212,10 +242,19 @@ def load_model(model_names):
                 **common_params
             )
 
+        # Check whether the time limit was exceeded
+        elapsed_time = time.time() - start_time
+        if elapsed_time > load_timeout:
+            clear_gpu_memory()
+            loading_in_progress = False
+            return f"Model load timed out after {load_timeout} seconds. Please try again."
+
         current_model_name = model_name
-        return f"Model '{model_name}' loaded successfully. (optimization: {size_category} category)"
+        loading_in_progress = False
+        return f"Model '{model_name}' loaded successfully. (optimization: {size_category}, elapsed: {elapsed_time:.1f}s)"
 
     except Exception as e:
+        loading_in_progress = False
         return f"Model load failed: {str(e)}"
 
 @spaces.GPU
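Worth noting: `load_timeout` is only compared against `elapsed_time` after `from_pretrained`/`pipeline` has already returned, so a stalled download is never actually interrupted; the check can only report, after the fact, that the budget was blown. Preempting the wait requires running the load in a worker thread. A sketch under that assumption (Python threads cannot be force-killed, so the worker may still finish in the background):

```python
import threading

def load_model_with_deadline(model_names, timeout_s=300):
    """Wait at most timeout_s for load_model to finish (sketch)."""
    result = {}

    def worker():
        result["msg"] = load_model(model_names)

    t = threading.Thread(target=worker, daemon=True)
    t.start()
    t.join(timeout_s)  # bounded wait, mirroring the join pattern in bot()
    if t.is_alive():
        return f"Model load still running after {timeout_s}s; gave up waiting."
    return result.get("msg", "Model load finished without a message.")
```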
@@ -272,8 +311,6 @@ def bot(
     messages = rebuild_messages(history)
 
     # Timeout setup
-    import signal
-
     class TimeoutError(Exception):
         pass
 
@@ -348,7 +385,6 @@ def bot(
             continue
 
         # Wait at most 30 seconds, then move on to the next step
-        import time
         join_start_time = time.time()
         while t.is_alive() and (time.time() - join_start_time) < 30:
             t.join(1)  # check once per second
@@ -390,6 +426,35 @@ def get_gpu_info():
 
     return "\n".join(gpu_info)
 
+# Automatic model-load function (with status updates)
+def auto_load_model():
+    # Auto-load the first model
+    model_key = DEFAULT_MODEL_KEY
+    try:
+        # Return a placeholder message so the UI shows progress
+        return "Auto-loading the small model... please wait."
+    except Exception as e:
+        return f"Automatic model load failed: {str(e)}"
+
+# Actual model-load function (asynchronous)
+def load_model_async(model_status):
+    # Load the model asynchronously (the actual load runs in the background)
+    model_key = DEFAULT_MODEL_KEY
+
+    def update_status(status):
+        model_status.update(value=status)
+
+    # Load in a separate thread
+    def load_in_thread():
+        try:
+            result = load_model([model_key], update_status)
+            model_status.update(value=result)
+        except Exception as e:
+            model_status.update(value=f"Model load failed: {str(e)}")
+
+    threading.Thread(target=load_in_thread, daemon=True).start()
+    return "Preparing model load... it will proceed automatically."
+
 # Gradio interface
 with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Service") as demo:
     # Add a title and description at the top
@@ -423,7 +488,7 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         gr.Markdown("""## Model Selection""")
         model_selector = gr.Radio(
             choices=list(available_models.values()),
-            value=DEFAULT_MODEL_VALUE,  # set the correct default model
+            value=DEFAULT_MODEL_VALUE,
             label="Select the LLM model to use",
         )
 
@@ -439,7 +504,7 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         num_tokens = gr.Slider(
             50,
             2000,
-            1000,  # reduced default
+            1000,
             step=50,
             label="Max tokens per reasoning step",
             interactive=True,
@@ -447,7 +512,7 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         final_num_tokens = gr.Slider(
             50,
             3000,
-            1500,  # reduced default
+            1500,
             step=50,
             label="Max tokens for the final answer",
             interactive=True,
@@ -455,19 +520,12 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         do_sample = gr.Checkbox(True, label="Use sampling")
         temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
 
-    # Add automatic model loading
-    def auto_load_model():
-        # Auto-load the first model
-        model_key = DEFAULT_MODEL_KEY
-        try:
-            result = load_model([model_key])
-            return result
-        except Exception as e:
-            return f"Automatic model load failed: {str(e)}"
-
-    # Automatically load the model on startup (when the Space starts)
+    # Initialize automatically on startup
    demo.load(auto_load_model, [], [model_status])
 
+    # Load the model asynchronously after startup (avoids delaying the initial render)
+    demo.load(lambda x: load_model_async(x), [model_status], [], _js="() => {}")
+
     # Wire up the selected-model load event
     def get_model_names(selected_model):
         # Convert the display name back to the original model name
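One caution about the background-load wiring: a Gradio event handler receives component *values*, not components, so the `model_status` argument inside `load_model_async` is a plain string, and calling `.update(value=...)` on it from the worker thread would raise rather than refresh the UI. The usual way to stream status into an output component is a generator handler with queuing enabled; a sketch of that pattern, assuming the `load_model` above:

```python
# Sketch: stream loading status by yielding successive values for model_status.
def load_default_model_with_status():
    yield "Auto-loading the default model... please wait."
    yield load_model([DEFAULT_MODEL_KEY])  # final success/failure message

# demo.load(load_default_model_with_status, [], [model_status])
```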