openfree committed
Commit 5969407 · verified · 1 parent: 60f9305

Update app.py

Files changed (1): app.py (+102, -36)
app.py CHANGED
@@ -234,43 +234,55 @@ def bot(
         history.append(
             gr.ChatMessage(
                 role="assistant",
-                content="The model is not loaded. Please select at least one model.",
+                content="The model is not loaded. Please select at least one model and click the 'Load Model' button.",
             )
         )
         yield history
         return
 
-    # Auto-adjust the token budget (based on model size)
-    size_category = get_model_size_category(current_model_name)
-
-    # Reduce the token count for large models to improve memory efficiency
-    if size_category == "large":
-        max_num_tokens = min(max_num_tokens, 1000)
-        final_num_tokens = min(final_num_tokens, 1500)
-
-    # For streaming the tokens from the thread later
-    streamer = transformers.TextIteratorStreamer(
-        pipe.tokenizer,
-        skip_special_tokens=True,
-        skip_prompt=True,
-    )
-
-    # For re-inserting the question into the reasoning when needed
-    question = history[-1]["content"]
-
-    # Prepare the assistant message
-    history.append(
-        gr.ChatMessage(
-            role="assistant",
-            content=str(""),
-            metadata={"title": "🧠 Thinking...", "status": "pending"},
-        )
-    )
-
-    # The reasoning process to display in the current chat
-    messages = rebuild_messages(history)
-
-    try:
+    try:
+        # Auto-adjust the token budget (based on model size)
+        size_category = get_model_size_category(current_model_name)
+
+        # Reduce the token count for large models to improve memory efficiency
+        if size_category == "large":
+            max_num_tokens = min(max_num_tokens, 1000)
+            final_num_tokens = min(final_num_tokens, 1500)
+
+        # For streaming the tokens from the thread later
+        streamer = transformers.TextIteratorStreamer(
+            pipe.tokenizer,
+            skip_special_tokens=True,
+            skip_prompt=True,
+        )
+
+        # For re-inserting the question into the reasoning when needed
+        question = history[-1]["content"]
+
+        # Prepare the assistant message
+        history.append(
+            gr.ChatMessage(
+                role="assistant",
+                content=str(""),
+                metadata={"title": "🧠 Thinking...", "status": "pending"},
+            )
+        )
+
+        # The reasoning process to display in the current chat
+        messages = rebuild_messages(history)
+
+        # Timeout setup
+        import signal
+
+        class TimeoutError(Exception):
+            pass
+
+        def timeout_handler(signum, frame):
+            raise TimeoutError("Request processing timed out.")
+
+        # Allow each step at most 120 seconds
+        timeout_seconds = 120
+
         for i, prepend in enumerate(rethink_prepends):
             if i > 0:
                 messages[-1]["content"] += "\n\n"
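For readers unfamiliar with the streaming setup this hunk keeps: `TextIteratorStreamer` turns a blocking `generate()` call into an iterable by running generation on a worker thread. A minimal, self-contained sketch of that pattern, assuming a standard transformers text-generation pipeline (`pipe` follows the diff; `stream_generate` and `prompt` are illustrative names, not from app.py):

```python
# Minimal sketch of the worker-thread + TextIteratorStreamer pattern.
import threading

import transformers


def stream_generate(pipe, prompt, max_new_tokens=256):
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,
        skip_special_tokens=True,  # hide special tokens such as <eos>
        skip_prompt=True,          # emit only newly generated text
    )
    inputs = pipe.tokenizer(prompt, return_tensors="pt").to(pipe.model.device)
    thread = threading.Thread(
        target=pipe.model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens),
    )
    thread.start()
    for text_chunk in streamer:  # blocks until the worker produces more text
        yield text_chunk
    thread.join()
```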
@@ -294,6 +306,7 @@ def bot(
                     use_cache=True,  # Use the KV cache
                 ),
             )
+            t.daemon = True  # Daemon thread: terminates together with the main thread
             t.start()
 
             # Rebuild the history with the new content
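The one-line change above only marks the generation thread as a daemon. A standalone toy illustration of what that flag changes (not part of app.py): the interpreter does not wait for daemon threads at shutdown, at the cost of skipping any cleanup they would have done.

```python
# Toy illustration of t.daemon = True.
import threading
import time


def worker():
    time.sleep(60)  # stands in for a long model.generate() call


t = threading.Thread(target=worker)
t.daemon = True  # must be set before start(); the default (False) would block exit
t.start()
print("main thread done")  # the process exits here instead of hanging for 60 s
```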
@@ -302,14 +315,48 @@ def bot(
             history[-1].metadata = {"title": "💭 Thinking process", "status": "done"}
             # Thinking is done; now the answer (no metadata for the intermediate steps)
             history.append(gr.ChatMessage(role="assistant", content=""))
+
+            # Set the timeout (only works on Unix systems)
+            try:
+                if hasattr(signal, 'SIGALRM'):
+                    signal.signal(signal.SIGALRM, timeout_handler)
+                    signal.alarm(timeout_seconds)
+
+                # Token streaming
+                token_count = 0
+                for token in streamer:
+                    history[-1].content += token
+                    history[-1].content = reformat_math(history[-1].content)
+                    token_count += 1
+
+                    # Yield every 10 tokens (improves UI responsiveness)
+                    if token_count % 10 == 0:
+                        yield history
 
-            # Token streaming
-            for token in streamer:
-                history[-1].content += token
-                history[-1].content = reformat_math(history[-1].content)
+                # Yield the remaining content
                 yield history
 
-            t.join()
+                # Cancel the timeout
+                if hasattr(signal, 'SIGALRM'):
+                    signal.alarm(0)
+
+            except TimeoutError:
+                if hasattr(signal, 'SIGALRM'):
+                    signal.alarm(0)
+                history[-1].content += "\n\n⚠️ Response generation timed out. Moving on to the next step."
+                yield history
+                continue
+
+            # Wait at most 30 seconds, then move on to the next step
+            import time
+            join_start_time = time.time()
+            while t.is_alive() and (time.time() - join_start_time) < 30:
+                t.join(1)  # Check once per second
+
+            # If the thread is still running, move on anyway
+            if t.is_alive():
+                history[-1].content += "\n\n⚠️ Response generation is taking longer than expected. Moving on to the next step."
+                yield history
 
             # For large models, do a partial memory cleanup after each step
             if size_category == "large" and torch.cuda.is_available():
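A minimal sketch of the `signal.alarm` timeout pattern this hunk wraps around token streaming (`StepTimeout`, `run_with_alarm`, and the usage line are illustrative names, not from app.py). Two caveats are mirrored by the diff's `hasattr()` guard: `SIGALRM` exists only on Unix, and CPython allows installing signal handlers only from the main thread, so the pattern assumes it runs there. The hunk's 30-second bounded `join()` loop then serves as the fallback on platforms where the alarm is unavailable.

```python
# Sketch of a SIGALRM-based per-step timeout.
import signal


class StepTimeout(Exception):
    pass


def _on_alarm(signum, frame):
    raise StepTimeout("step exceeded its time budget")


def run_with_alarm(fn, timeout_seconds):
    if not hasattr(signal, "SIGALRM"):
        return fn()  # e.g. on Windows: run without a timeout
    signal.signal(signal.SIGALRM, _on_alarm)
    signal.alarm(timeout_seconds)  # schedule SIGALRM in timeout_seconds
    try:
        return fn()
    finally:
        signal.alarm(0)  # always cancel a still-pending alarm


# Illustrative use: abort if draining an iterator takes over 120 seconds.
# run_with_alarm(lambda: sum(1 for _ in streamer), 120)
```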
@@ -317,9 +364,15 @@ def bot(
 
         except Exception as e:
             # Notify the user when an error occurs
-            if len(history) > 0 and history[-1].role == "assistant":
-                history[-1].content += f"\n\n⚠️ An error occurred during processing: {str(e)}"
-                yield history
+            import traceback
+            error_msg = f"\n\n⚠️ An error occurred during processing: {str(e)}\n{traceback.format_exc()}"
+
+            if len(history) > 0 and isinstance(history[-1], gr.ChatMessage) and history[-1].role == "assistant":
+                history[-1].content += error_msg
+            else:
+                history.append(gr.ChatMessage(role="assistant", content=error_msg))
+
+            yield history
 
         yield history
 
@@ -402,6 +455,19 @@ with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Servi
         do_sample = gr.Checkbox(True, label="Use sampling")
         temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
 
+    # Add automatic model loading
+    def auto_load_model():
+        # Auto-load the first model
+        model_key = DEFAULT_MODEL_KEY
+        try:
+            result = load_model([model_key])
+            return result
+        except Exception as e:
+            return f"Automatic model load failed: {str(e)}"
+
+    # Load the model automatically at startup (when the Space starts)
+    demo.load(auto_load_model, [], [model_status])
+
     # Wire up the load event for the selected model
     def get_model_names(selected_model):
         # Convert the display name back to the original model name
 