Spaces · Running on Zero

AbstractPhil committed · Commit 644faa4 · Parent(s): 625e096

more claude not helping

app.py CHANGED
@@ -320,15 +320,15 @@ def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, s
         msgs.append({"role": "assistant", "content": str(a)})
     return msgs
 
-def generate_response_streaming(message: Any, history: List[Any], system_prompt: …
-    …
-    …
-    …
-    …
-    …
+def generate_response(message: Any, history: List[Any], system_prompt: str,
+                      temperature: float, top_p: float, top_k: int, max_new_tokens: int,
+                      do_sample: bool, seed: Optional[int],
+                      rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
+                      rose_tokens: str, rose_json: str,
+                      show_thinking: bool = False) -> str:
     """
-    …
-    …
+    Non-streaming response generator for ChatInterface.
+    Returns a complete response to avoid h11 Content-Length issues.
     """
     try:
         # Normalize message and build Harmony prompt
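The move from a streaming generator to a plain-return function is what the new docstring's h11 Content-Length note refers to. Below is a minimal sketch, with illustrative names that are not from app.py, of the two fn shapes `gr.ChatInterface` accepts: a generator that yields partial strings versus a function that returns one complete string.

```python
import gradio as gr

def reply_streaming(message, history):
    # Generator shape: each yield replaces the assistant bubble with partial text
    yield "thinking..."
    yield f"You said: {message}"

def reply_complete(message, history):
    # Plain-return shape: one complete string is sent back in a single response
    return f"You said: {message}"

# Either shape can be passed as fn=...; this commit moves Mirel to the plain-return shape
demo = gr.ChatInterface(fn=reply_complete)

if __name__ == "__main__":
    demo.launch()
```

Returning one complete string presumably sidesteps the mismatched-framing errors the docstring attributes to h11, since nothing is streamed chunk by chunk.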
@@ -338,9 +338,6 @@ def generate_response_streaming(message: Any, history: List[Any], system_prompt:
         msgs = chat_to_messages(history, system_prompt)
         msgs.append({"role": "user", "content": str(message)})
 
-        # Yield initial status
-        yield "🤔 Preparing prompt..."
-
         prompt = to_harmony_prompt(msgs)
 
         # Build Rose map if enabled
@@ -369,9 +366,6 @@ def generate_response_streaming(message: Any, history: List[Any], system_prompt:
                 pass
         if not rose_map:
             rose_map = None
-
-        # Update status
-        yield "💭 Generating response..."
 
         # Generate with model
         full_output = zerogpu_generate(
@@ -392,19 +386,16 @@ def generate_response_streaming(message: Any, history: List[Any], system_prompt:
         # Extract final response from CoT output
         if show_thinking:
             # Show the full chain-of-thought process
-            …
+            return f"**Full Output (with thinking):**\n```\n{full_output}\n```\n\n**Final Response:**\n{extract_final_channel(full_output)}"
         else:
             # Just show the final response
-            …
-
-        # Yield the final response
-        yield final_response
+            return extract_final_channel(full_output)
 
     except Exception as e:
         error_msg = f"⚠️ Error: {str(e)}"
-        print(f"[Error in …
+        print(f"[Error in generate_response] {error_msg}")
         print(traceback.format_exc())
-        …
+        return error_msg
 
 # -----------------------
 # UI
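`extract_final_channel` is called by the new code but not defined in this diff. The following is a hypothetical sketch of such a helper, assuming Harmony-style channel markers of the form `<|channel|>final<|message|> ... <|return|>`; the actual implementation elsewhere in app.py may differ.

```python
# Hypothetical sketch only; the real extract_final_channel lives elsewhere in app.py.
def extract_final_channel(full_output: str) -> str:
    marker = "<|channel|>final<|message|>"
    if marker in full_output:
        # Take the text after the last final-channel marker
        final = full_output.rsplit(marker, 1)[-1]
        # Strip trailing Harmony stop tokens if they are present
        for stop in ("<|return|>", "<|end|>"):
            final = final.split(stop, 1)[0]
        return final.strip()
    # Fall back to the raw output when no final channel is found
    return full_output.strip()
```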
@@ -419,37 +410,16 @@ css = """
 """
 
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
-    # OAuth status tracking
-    login_status = gr.State(value=None)
-
     gr.Markdown(
         """
         # Mirel – Harmony Inference (ZeroGPU-ready)
 
         Chain-of-thought OSS-20B model with Harmony formatting.
         The model thinks through problems internally before providing a final response.
+
+        **Auth:** Set `HF_TOKEN` in Space secrets or add `hf_oauth: true` to README for browser auth.
         """
     )
-
-    # Add OAuth login button for browser-based auth
-    with gr.Row():
-        with gr.Column(scale=1):
-            login_btn = gr.LoginButton(value="Sign in with Hugging Face", size="sm")
-        with gr.Column(scale=3):
-            auth_status = gr.Markdown("Not signed in - using default access")
-
-    # OAuth handler to get user profile
-    def update_auth_status(profile: gr.OAuthProfile | None) -> str:
-        if profile:
-            return f"✅ Signed in as **{profile.name}** (username: {profile.username})"
-        else:
-            if HF_TOKEN:
-                return "✅ Using token from environment"
-            else:
-                return "Not signed in - using default access"
-
-    # Update auth status on load
-    demo.load(update_auth_status, inputs=None, outputs=auth_status)
 
     with gr.Row():
         with gr.Column(scale=3):
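With the LoginButton/OAuthProfile block removed, the Space is left with the environment-token path the deleted handler already fell back to. A hypothetical illustration of that fallback, with names assumed rather than taken from app.py:

```python
import os

# Assumed fallback: with the OAuth UI removed, access relies on a token
# provided via the Space's secrets (or none at all).
HF_TOKEN = os.getenv("HF_TOKEN")  # set in Space secrets, or None

def auth_status_message() -> str:
    # Mirrors the strings the deleted update_auth_status handler returned
    if HF_TOKEN:
        return "✅ Using token from environment"
    return "Not signed in - using default access"
```

Per the new Markdown text, browser-based sign-in can still be restored without this UI by adding `hf_oauth: true` to the Space's README metadata.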
@@ -494,7 +464,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
 
     # Chat interface
     chat = gr.ChatInterface(
-        fn=…
+        fn=generate_response,
         chatbot=gr.Chatbot(elem_id="chatbot", height=500, type="messages"),
         additional_inputs=[
             system_prompt, temperature, top_p, top_k, max_new,
@@ -504,7 +474,6 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
         title=None,  # Title already in markdown
         description=None,  # Description already in markdown
         cache_examples=False,
-        analytics_enabled=False,
     )
 
     gr.Markdown(
@@ -531,13 +500,11 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     )
 
 if __name__ == "__main__":
-    # …
+    # Simple queue configuration
     demo.queue(
-        max_size=10
-        default_concurrency_limit=1 if ZEROGPU else 2  # Limit concurrent requests
+        max_size=10,
     ).launch(
         server_name="0.0.0.0",
         server_port=7860,
         share=False,
-        max_threads=40,  # Default FastAPI thread pool size
     )
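The deleted queue kwargs were not just noise: `max_size=10` was missing the trailing comma before `default_concurrency_limit=...`, so the old block would not even parse. A sketch of how those kwargs could be restored correctly, if concurrency limiting is still wanted; `ZEROGPU` stands in for the flag app.py defines elsewhere.

```python
import gradio as gr

ZEROGPU = True  # stand-in for the flag app.py derives from the Space hardware

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")

# max_size and default_concurrency_limit are separate keyword arguments and
# need the comma the deleted lines were missing
demo.queue(
    max_size=10,
    default_concurrency_limit=1 if ZEROGPU else 2,  # limit concurrent requests
).launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
)
```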