openfree committed · verified
Commit ccc2ed2 · 1 Parent(s): 5969407

Update app.py

Files changed (1): app.py +108 −50
app.py CHANGED
@@ -3,7 +3,8 @@ import threading
 import gc
 import os
 import torch
-
+import time
+import signal
 import gradio as gr
 import spaces
 import transformers
@@ -13,24 +14,26 @@ from huggingface_hub import login
 # Settings for model memory management and optimization
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-MAX_GPU_MEMORY = 80 * 1024 * 1024 * 1024  # based on an 80GB A100 (actual usable memory is lower)
+MAX_GPU_MEMORY = 80 * 1024 * 1024 * 1024  # based on an 80GB A100
 
-# List of available models - filtered to those that run efficiently on an A100
+# List of available models - reordered to start from smaller models
 available_models = {
+    "google/gemma-2b": "Google Gemma (2B)",  # smaller model as the new default
+    "mistralai/Mistral-7B-Instruct-v0.2": "Mistral 7B Instruct v0.2",
     "mistralai/Mistral-Small-3.1-24B-Base-2503": "Mistral Small 3.1 (24B)",
-    "bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF": "Mistral Small 3.1 GGUF (24B)",
     "google/gemma-3-27b-it": "Google Gemma 3 (27B)",
     "Qwen/Qwen2.5-Coder-32B-Instruct": "Qwen 2.5 Coder (32B)",
     "open-r1/OlympicCoder-32B": "Olympic Coder (32B)"
 }
 
-# Default model - the first entry in available_models
+# Default model - now the smallest model
 DEFAULT_MODEL_KEY = list(available_models.keys())[0]
 DEFAULT_MODEL_VALUE = available_models[DEFAULT_MODEL_KEY]
 
 # Global variables used for model loading
 pipe = None
 current_model_name = None
+loading_in_progress = False
 
 # Try to log in with the Hugging Face token
 try:
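Since `DEFAULT_MODEL_KEY` is simply the first dictionary key, reordering `available_models` is what actually flips the default; Python dicts preserve insertion order (3.7+), so the choice is deterministic. A quick sanity check against the dictionary above:

```python
# With the reordered dictionary, the 2B model is now both the first key
# and therefore the auto-loaded default.
assert list(available_models.keys())[0] == "google/gemma-2b"
assert DEFAULT_MODEL_VALUE == "Google Gemma (2B)"
```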
@@ -70,33 +73,33 @@ latex_delimiters = [
 # Model-size-based configuration - defines optimal settings per size class
 MODEL_CONFIG = {
     "small": {  # <10B
-        "max_memory": {0: "20GiB"},
+        "max_memory": {0: "10GiB"},
         "offload": False,
         "quantization": None
     },
     "medium": {  # 10B-30B
-        "max_memory": {0: "40GiB"},
+        "max_memory": {0: "30GiB"},
         "offload": False,
-        "quantization": None  # quantization disabled due to BitsAndBytes issues
+        "quantization": None
     },
     "large": {  # >30B
-        "max_memory": {0: "70GiB"},
+        "max_memory": {0: "60GiB"},
         "offload": True,
-        "quantization": None  # quantization disabled due to BitsAndBytes issues
+        "quantization": None
     }
 }
 
 def get_model_size_category(model_name):
     """Determine the model's size category"""
-    if "3B" in model_name or "8B" in model_name:
+    if "2B" in model_name or "3B" in model_name or "7B" in model_name or "8B" in model_name:
         return "small"
-    elif "24B" in model_name or "27B" in model_name:
+    elif "15B" in model_name or "24B" in model_name or "27B" in model_name:
         return "medium"
     elif "32B" in model_name or "70B" in model_name:
         return "large"
     else:
-        # return medium by default
-        return "medium"
+        # return small by default (to be safe)
+        return "small"
 
 def clear_gpu_memory():
     """Clear GPU memory"""
@@ -138,26 +141,36 @@ def rebuild_messages(history: list):
         messages.append({"role": h.role, "content": h.content})
     return messages
 
-def load_model(model_names):
+def load_model(model_names, status_callback=None):
     """Load the selected model (with settings optimized for an A100)"""
-    global pipe, current_model_name
-
-    # Clean up the existing model
-    clear_gpu_memory()
+    global pipe, current_model_name, loading_in_progress
 
-    # Fall back to the default when no model is selected
-    if not model_names:
-        model_name = DEFAULT_MODEL_KEY  # use the first available model as the default
-    else:
-        # Use the first selected model
-        model_name = model_names[0]
+    # Bail out if another load is already in progress
+    if loading_in_progress:
+        return "Another model is already loading. Please wait a moment."
 
-    # Check the model size category
-    size_category = get_model_size_category(model_name)
-    config = MODEL_CONFIG[size_category]
+    loading_in_progress = True
 
-    # Load the model (applying size-optimized settings)
     try:
+        # Clean up the existing model
+        clear_gpu_memory()
+
+        # Fall back to the default when no model is selected
+        if not model_names:
+            model_name = DEFAULT_MODEL_KEY
+        else:
+            # Use the first selected model
+            model_name = model_names[0]
+
+        # Check the model size category
+        size_category = get_model_size_category(model_name)
+        config = MODEL_CONFIG[size_category]
+
+        # Report loading progress
+        if status_callback:
+            status_callback(f"Loading model '{model_name}'... (size: {size_category})")
+
+        # Load the model (applying size-optimized settings)
         # Check the HF_TOKEN environment variable
         hf_token = os.getenv("HF_TOKEN")
         # Common parameters
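The `loading_in_progress` flag prevents most double loads, but a plain boolean is not atomic: two requests can both observe `False` before either sets it. A lock-based variant would close that window; this is a sketch reusing the `load_model` above, not the committed code:

```python
import threading

_load_lock = threading.Lock()

def load_model_exclusive(model_names, status_callback=None):
    """Non-blocking, mutually exclusive wrapper around load_model (sketch)."""
    if not _load_lock.acquire(blocking=False):
        return "Another model is already loading. Please wait a moment."
    try:
        return load_model(model_names, status_callback)
    finally:
        _load_lock.release()
```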
@@ -166,14 +179,25 @@ def load_model(model_names):
             "trust_remote_code": True,
         }
 
-    # Check whether BitsAndBytes can be used
+        # Check whether BitsAndBytes is available
         try:
             import bitsandbytes
             has_bitsandbytes = True
-        print("BitsAndBytes library loaded successfully")
         except ImportError:
             has_bitsandbytes = False
-        print("BitsAndBytes library not found. Loading the model without quantization.")
+            if status_callback:
+                status_callback("BitsAndBytes library not found. Loading without quantization.")
+
+        # Set a load time limit (varies with model size)
+        if size_category == "small":
+            load_timeout = 180  # 3 minutes
+        elif size_category == "medium":
+            load_timeout = 300  # 5 minutes
+        else:
+            load_timeout = 600  # 10 minutes
+
+        # Record when loading started
+        start_time = time.time()
 
         # If quantization is configured and BitsAndBytes is available
         if config["quantization"] and has_bitsandbytes:
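Note that every tier in `MODEL_CONFIG` still sets `"quantization": None`, so the `config["quantization"] and has_bitsandbytes` branch can never run in this commit; the BitsAndBytes probe only affects the status message. If 4-bit loading were re-enabled, the tier entry would carry a real config. A sketch follows; whether the field is meant to hold a flag or a full `BitsAndBytesConfig` object is not visible in the diff:

```python
import torch
from transformers import BitsAndBytesConfig

# Hypothetical re-enabling of 4-bit quantization for the "large" tier only.
MODEL_CONFIG["large"]["quantization"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # matches DTYPE on CUDA
)
```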
@@ -184,6 +208,9 @@ def load_model(model_names):
                 bnb_4bit_compute_dtype=DTYPE
             )
 
+            if status_callback:
+                status_callback(f"Loading model '{model_name}'... (applying quantization)")
+
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 device_map="auto",
@@ -204,6 +231,9 @@ def load_model(model_names):
             )
         else:
             # Load without quantization
+            if status_callback:
+                status_callback(f"Loading model '{model_name}'... (standard mode)")
+
             pipe = pipeline(
                 "text-generation",
                 model=model_name,
@@ -212,10 +242,19 @@ def load_model(model_names):
                 **common_params
             )
 
+        # Check whether the time limit was exceeded
+        elapsed_time = time.time() - start_time
+        if elapsed_time > load_timeout:
+            clear_gpu_memory()
+            loading_in_progress = False
+            return f"Model load timed out after {load_timeout} seconds. Please try again."
+
         current_model_name = model_name
-        return f"Model '{model_name}' loaded successfully. (optimization: {size_category} category)"
+        loading_in_progress = False
+        return f"Model '{model_name}' loaded successfully. (optimization: {size_category}, elapsed: {elapsed_time:.1f}s)"
 
     except Exception as e:
+        loading_in_progress = False
         return f"Model load failed: {str(e)}"
 
 @spaces.GPU
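Worth noting: `load_timeout` is only compared against `elapsed_time` after `from_pretrained`/`pipeline` has already returned, so a stalled download is never actually interrupted; the check can only report, after the fact, that the budget was blown. Preempting the wait requires running the load in a worker thread. A sketch under that assumption (Python threads cannot be force-killed, so the worker may still finish in the background):

```python
import threading

def load_model_with_deadline(model_names, timeout_s=300):
    """Wait at most timeout_s for load_model to finish (sketch)."""
    result = {}

    def worker():
        result["msg"] = load_model(model_names)

    t = threading.Thread(target=worker, daemon=True)
    t.start()
    t.join(timeout_s)  # bounded wait, mirroring the join pattern in bot()
    if t.is_alive():
        return f"Model load still running after {timeout_s}s; gave up waiting."
    return result.get("msg", "Model load finished without a message.")
```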
@@ -272,8 +311,6 @@ def bot(
     messages = rebuild_messages(history)
 
     # Timeout setup
-    import signal
-
     class TimeoutError(Exception):
         pass
 
@@ -348,7 +385,6 @@ def bot(
             continue
 
         # Wait at most 30 seconds, then move on to the next step
-        import time
         join_start_time = time.time()
         while t.is_alive() and (time.time() - join_start_time) < 30:
             t.join(1)  # check once per second
@@ -390,6 +426,35 @@ def get_gpu_info():
 
     return "\n".join(gpu_info)
 
+# Automatic model-load function (with status updates)
+def auto_load_model():
+    # Auto-load the first model
+    model_key = DEFAULT_MODEL_KEY
+    try:
+        # Return a placeholder message so the UI shows progress
+        return "Auto-loading the small model... please wait."
+    except Exception as e:
+        return f"Automatic model load failed: {str(e)}"
+
+# Actual model-load function (asynchronous)
+def load_model_async(model_status):
+    # Load the model asynchronously (the actual load runs in the background)
+    model_key = DEFAULT_MODEL_KEY
+
+    def update_status(status):
+        model_status.update(value=status)
+
+    # Load in a separate thread
+    def load_in_thread():
+        try:
+            result = load_model([model_key], update_status)
+            model_status.update(value=result)
+        except Exception as e:
+            model_status.update(value=f"Model load failed: {str(e)}")
+
+    threading.Thread(target=load_in_thread, daemon=True).start()
+    return "Preparing model load... it will proceed automatically."
+
 # Gradio interface
 with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Service") as demo:
     # Add a title and description at the top
@@ -423,7 +488,7 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         gr.Markdown("""## Model Selection""")
         model_selector = gr.Radio(
             choices=list(available_models.values()),
-            value=DEFAULT_MODEL_VALUE,  # set the correct default model
+            value=DEFAULT_MODEL_VALUE,
             label="Select the LLM model to use",
         )
 
@@ -439,7 +504,7 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         num_tokens = gr.Slider(
             50,
             2000,
-            1000,  # reduced default
+            1000,
             step=50,
             label="Max tokens per reasoning step",
             interactive=True,
@@ -447,7 +512,7 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         final_num_tokens = gr.Slider(
             50,
             3000,
-            1500,  # reduced default
+            1500,
             step=50,
             label="Max tokens for the final answer",
             interactive=True,
@@ -455,19 +520,12 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         do_sample = gr.Checkbox(True, label="Use sampling")
         temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
 
-    # Add automatic model loading
-    def auto_load_model():
-        # Auto-load the first model
-        model_key = DEFAULT_MODEL_KEY
-        try:
-            result = load_model([model_key])
-            return result
-        except Exception as e:
-            return f"Automatic model load failed: {str(e)}"
-
-    # Automatically load the model on startup (when the Space starts)
+    # Initialize automatically on startup
    demo.load(auto_load_model, [], [model_status])
 
+    # Load the model asynchronously after startup (avoids delaying the initial render)
+    demo.load(lambda x: load_model_async(x), [model_status], [], _js="() => {}")
+
     # Wire up the selected-model load event
     def get_model_names(selected_model):
         # Convert the display name back to the original model name
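One caution about the background-load wiring: a Gradio event handler receives component *values*, not components, so the `model_status` argument inside `load_model_async` is a plain string, and calling `.update(value=...)` on it from the worker thread would raise rather than refresh the UI. The usual way to stream status into an output component is a generator handler with queuing enabled; a sketch of that pattern, assuming the `load_model` above:

```python
# Sketch: stream loading status by yielding successive values for model_status.
def load_default_model_with_status():
    yield "Auto-loading the default model... please wait."
    yield load_model([DEFAULT_MODEL_KEY])  # final success/failure message

# demo.load(load_default_model_with_status, [], [model_status])
```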