AbstractPhil committed on
Commit d76bf3e · 1 Parent(s): 4fc7c90
Files changed (1)
  1. app.py +457 -301
app.py CHANGED
@@ -1,376 +1,532 @@
 """
 Mirel Harmony Inference – HF Space (Gradio)
-Simplified version with robust error handling
 """
-import os
-import gc
-import json
-import torch
 import gradio as gr
-from typing import List, Dict, Optional, Any, Generator
 from transformers import AutoTokenizer, AutoModelForCausalLM

-# Check if spaces is available
 try:
-    import spaces
-    SPACES_AVAILABLE = True
 except ImportError:
-    SPACES_AVAILABLE = False
-    print("[WARNING] spaces not available, running without ZeroGPU")

 # -----------------------
-# Config
 # -----------------------
-MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
-ADAPTER_ID = os.getenv("ADAPTER_ID")
-ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER")
-SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "You are Mirel, a helpful assistant.")
-MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
-DTYPE = os.getenv("DTYPE", "bf16")
-ZEROGPU = os.getenv("ZEROGPU", "0") == "1"
-
-# HF Token
-HF_TOKEN = (
     os.getenv("HF_TOKEN")
     or os.getenv("HUGGING_FACE_HUB_TOKEN")
     or os.getenv("HUGGINGFACEHUB_API_TOKEN")
 )

-if HF_TOKEN:
-    try:
-        from huggingface_hub import login
-        login(token=HF_TOKEN)
-        print("[Auth] Logged in to Hugging Face")
-    except Exception as e:
-        print(f"[Auth] Failed to login: {e}")

 # -----------------------
-# Model Loading
 # -----------------------
-print(f"[Model] Loading tokenizer from {MODEL_ID}")
-tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    token=HF_TOKEN
-)

-model = None

-def get_dtype():
-    """Get the appropriate dtype for the model."""
-    if DTYPE == "bf16" and torch.cuda.is_available():
-        return torch.bfloat16
-    elif DTYPE == "fp16":
-        return torch.float16
-    else:
-        return torch.float32

-def load_model():
-    """Load the model (called inside GPU context if using ZeroGPU)."""
-    global model
-    if model is None:
-        print(f"[Model] Loading model from {MODEL_ID}")
-
-        kwargs = {
-            "torch_dtype": get_dtype(),
-            "device_map": "auto" if torch.cuda.is_available() else "cpu",
-            "trust_remote_code": True,
-            "token": HF_TOKEN,
-            "low_cpu_mem_usage": True,
-        }
-
-        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **kwargs)
-
-        # Load adapter if specified
-        if ADAPTER_ID:
-            try:
-                from peft import PeftModel
-                print(f"[Model] Loading adapter from {ADAPTER_ID}")
-                adapter_kwargs = {"token": HF_TOKEN}
-                if ADAPTER_SUBFOLDER:
-                    adapter_kwargs["subfolder"] = ADAPTER_SUBFOLDER
-                model = PeftModel.from_pretrained(
-                    model,
-                    ADAPTER_ID,
-                    is_trainable=False,
-                    **adapter_kwargs
-                )
-            except ImportError:
-                print("[WARNING] PEFT not installed, skipping adapter")
-            except Exception as e:
-                print(f"[WARNING] Failed to load adapter: {e}")
-
-        model.eval()
     return model

-def extract_final_response(text: str) -> str:
-    """Extract the final channel from chain-of-thought output."""
-    # Look for final channel marker
     final_marker = "<|channel|>final<|message|>"
     if final_marker in text:
         parts = text.split(final_marker)
         if len(parts) > 1:
             final_text = parts[-1]
-            # Clean end markers
-            for marker in ["<|return|>", "<|end|>", "<|endoftext|>"]:
                 if marker in final_text:
                     final_text = final_text.split(marker)[0]
             return final_text.strip()

-    # No channel markers, return cleaned text
     return text.strip()

 # -----------------------
-# Generation Function
 # -----------------------
-def generate_text(
-    prompt: str,
-    temperature: float = 0.7,
-    top_p: float = 0.9,
-    top_k: int = 0,
-    max_new_tokens: int = 512,
-    do_sample: bool = True,
-) -> str:
-    """Generate text using the model."""
     try:
-        # Load/get model
-        model_instance = load_model()

-        # Tokenize
-        inputs = tokenizer(prompt, return_tensors="pt")
-        if torch.cuda.is_available():
-            inputs = inputs.to("cuda")

         # Generate
-        with torch.no_grad():
-            outputs = model_instance.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k if top_k > 0 else None,
-                do_sample=do_sample,
-                pad_token_id=tokenizer.eos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-            )

-        # Decode
-        prompt_len = inputs["input_ids"].shape[1]
-        generated_ids = outputs[0][prompt_len:]
-        response = tokenizer.decode(generated_ids, skip_special_tokens=False)

-        return response

     except Exception as e:
-        error_msg = f"Generation error: {str(e)}"
-        print(f"[ERROR] {error_msg}")
-        return error_msg
     finally:
         # Cleanup
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        gc.collect()
-
-# Add GPU decorator if available
-if SPACES_AVAILABLE and ZEROGPU:
-    generate_text = spaces.GPU(duration=120)(generate_text)

 # -----------------------
-# Chat Function
 # -----------------------
-def chat_fn(
-    message: str,
-    history: List[List[str]],
-    system_prompt: str,
-    temperature: float,
-    top_p: float,
-    top_k: int,
-    max_new_tokens: int,
-    do_sample: bool,
-    show_thinking: bool,
-) -> str:
-    """Main chat function for Gradio."""
     try:
-        # Build conversation
-        messages = [{"role": "system", "content": system_prompt or SYSTEM_PROMPT}]

-        for user_msg, assistant_msg in (history or []):
-            if user_msg:
-                messages.append({"role": "user", "content": user_msg})
-            if assistant_msg:
-                messages.append({"role": "assistant", "content": assistant_msg})

-        messages.append({"role": "user", "content": message})

-        # Apply chat template
-        try:
-            prompt = tokenizer.apply_chat_template(
-                messages,
-                add_generation_prompt=True,
-                tokenize=False
-            )
-        except Exception:
-            # Fallback to simple format
-            prompt = f"{system_prompt}\n\n"
-            for msg in messages[1:]:
-                role = msg["role"].upper()
-                content = msg["content"]
-                prompt += f"{role}: {content}\n"
-            prompt += "ASSISTANT: "
-
-        # Generate response
-        full_response = generate_text(
-            prompt=prompt,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=int(top_k),
-            max_new_tokens=int(max_new_tokens),
-            do_sample=do_sample,
         )

-        # Process response
         if show_thinking:
-            # Show full output with channels
-            final = extract_final_response(full_response)
-            return f"**Full Output:**\n```\n{full_response}\n```\n\n**Final Response:**\n{final}"
         else:
-            # Just show final response
-            return extract_final_response(full_response)

     except Exception as e:
-        error_msg = f"Chat error: {str(e)}"
-        print(f"[ERROR] {error_msg}")
-        return error_msg

 # -----------------------
-# Gradio Interface
 # -----------------------
-def create_interface():
-    """Create the Gradio interface."""
-
-    with gr.Blocks(title="Mirel Chat") as demo:
-        gr.Markdown(
-            """
-            # Mirel - Chain-of-Thought Chat
-
-            Chat with a model that thinks before responding.
-            """
-        )
-
-        with gr.Row():
-            with gr.Column(scale=4):
-                chatbot = gr.Chatbot(height=500)
-                msg = gr.Textbox(
-                    label="Message",
-                    placeholder="Type your message here...",
-                    lines=2
-                )
-                with gr.Row():
-                    submit = gr.Button("Send", variant="primary")
-                    clear = gr.Button("Clear")
-
-            with gr.Column(scale=1):
-                system_prompt = gr.Textbox(
-                    label="System Prompt",
-                    value=SYSTEM_PROMPT,
-                    lines=3
-                )
-
-                with gr.Accordion("Settings", open=False):
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=2.0,
-                        value=0.7,
-                        step=0.1,
-                        label="Temperature"
-                    )
-                    top_p = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.9,
-                        step=0.1,
-                        label="Top-p"
-                    )
-                    top_k = gr.Slider(
-                        minimum=0,
-                        maximum=100,
-                        value=0,
-                        step=1,
-                        label="Top-k (0=disabled)"
-                    )
-                    max_new_tokens = gr.Slider(
-                        minimum=64,
-                        maximum=2048,
-                        value=MAX_NEW_TOKENS,
-                        step=64,
-                        label="Max Tokens"
-                    )
-                    do_sample = gr.Checkbox(
-                        value=True,
-                        label="Do Sample"
-                    )
-                    show_thinking = gr.Checkbox(
-                        value=False,
-                        label="Show Thinking Process"
-                    )

-        # Event handlers
-        def user_submit(message, history):
-            return "", history + [[message, None]]

-        def bot_respond(history, system, temp, top_p, top_k, max_tokens, sample, thinking):
-            if not history or not history[-1][0]:
-                return history
-
-            user_message = history[-1][0]
-            bot_message = chat_fn(
-                user_message,
-                history[:-1],  # Don't include current turn
-                system,
-                temp,
-                top_p,
-                top_k,
-                max_tokens,
-                sample,
-                thinking
             )
-            history[-1][1] = bot_message
-            return history
-
-        msg.submit(
-            user_submit,
-            [msg, chatbot],
-            [msg, chatbot],
-            queue=False
-        ).then(
-            bot_respond,
-            [chatbot, system_prompt, temperature, top_p, top_k, max_new_tokens, do_sample, show_thinking],
-            chatbot
         )
-
-        submit.click(
-            user_submit,
-            [msg, chatbot],
-            [msg, chatbot],
-            queue=False
-        ).then(
-            bot_respond,
-            [chatbot, system_prompt, temperature, top_p, top_k, max_new_tokens, do_sample, show_thinking],
-            chatbot
         )

-        clear.click(lambda: None, None, chatbot, queue=False)
-
-        return demo

-# -----------------------
-# Main
-# -----------------------
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.queue(max_size=10)
-    demo.launch(
-        server_name="0.0.0.0",
         server_port=7860,
-        share=False
     )
 """
 Mirel Harmony Inference – HF Space (Gradio)
+ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
+Chain-of-thought model with proper channel extraction using openai_harmony
+Single file: app.py
 """
+from __future__ import annotations
+import os, gc, json, threading, torch
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Any
+from datetime import datetime
 import gradio as gr
+import spaces  # required for ZeroGPU
 from transformers import AutoTokenizer, AutoModelForCausalLM

+# Import Harmony components
 try:
+    from openai_harmony import (
+        Author,
+        Conversation,
+        HarmonyEncodingName,
+        Message,
+        Role,
+        SystemContent,
+        DeveloperContent,
+        load_harmony_encoding,
+        ReasoningEffort
+    )
+    HARMONY_AVAILABLE = True
 except ImportError:
+    print("[WARNING] openai_harmony not installed. Install with: pip install openai-harmony")
+    HARMONY_AVAILABLE = False

 # -----------------------
+# Config & runtime modes
 # -----------------------
+DTYPE_MAP = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
+
+MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
+ADAPTER_ID = os.getenv("ADAPTER_ID") or None
+ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
+ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
+DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
+SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
+MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "1024"))
+ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
+LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"
+
+# Harmony channels for CoT
+REQUIRED_CHANNELS = ["thinking", "analysis", "final"]
+
+# HF Auth - properly handle multiple token env var names
+HF_TOKEN: Optional[str] = (
     os.getenv("HF_TOKEN")
     or os.getenv("HUGGING_FACE_HUB_TOKEN")
     or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+    or os.getenv("HF_ACCESS_TOKEN")
 )

+def _hf_login() -> None:
+    """Login to HF Hub using common env secret names."""
+    if HF_TOKEN:
+        try:
+            from huggingface_hub import login, whoami
+            login(token=HF_TOKEN, add_to_git_credential=True)
+            try:
+                who = whoami(token=HF_TOKEN)
+                print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
+            except Exception:
+                print("[HF Auth] Login successful but couldn't get user info")
+        except Exception as e:
+            print(f"[HF Auth] Login failed: {e}")
+    else:
+        print("[HF Auth] No token found in environment variables")
+
+# Login before loading any models
+_hf_login()
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# Load Harmony encoding if available
+if HARMONY_AVAILABLE:
+    harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+else:
+    harmony_encoding = None
+
+# Tokenizer is lightweight; load once
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
+    print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
+except Exception as e:
+    print(f"[Model] Failed to load tokenizer: {e}")
+    raise

 # -----------------------
+# Model loading
 # -----------------------
+try:
+    from peft import PeftModel
+    _HAS_PEFT = True
+except Exception:
+    _HAS_PEFT = False


+def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
+    kw: Dict[str, Any] = dict(
+        torch_dtype=DTYPE,
+        device_map=device_map,
+        attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        token=HF_TOKEN,
+    )
+    if LOAD_4BIT and device_map != "cpu":
+        try:
+            import bitsandbytes as _bnb
+            kw.update(load_in_4bit=True)
+            if kw["device_map"] is None:
+                kw["device_map"] = "auto"
+        except Exception:
+            pass
+    return kw

+
+def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
+    print(f"[Model] Loading base model from {MODEL_ID}...")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))
+
+    if ADAPTER_ID:
+        if not _HAS_PEFT:
+            raise RuntimeError("peft is required when ADAPTER_ID is set.")
+        print(f"[Model] Loading adapter from {ADAPTER_ID}...")
+        peft_kwargs: Dict[str, Any] = {"token": HF_TOKEN}
+        if ADAPTER_SUBFOLDER:
+            peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
+        model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, **peft_kwargs)
+
+    model.eval()
+    model.config.use_cache = True
+    print("[Model] Model loaded successfully")
     return model

+# -----------------------
+# Harmony formatting
+# -----------------------
+
+def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> str:
+    """Create a proper Harmony-formatted prompt using openai_harmony."""
+    if not HARMONY_AVAILABLE:
+        # Fallback to tokenizer's chat template
+        return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+
+    # Map reasoning effort
+    effort_map = {
+        "low": ReasoningEffort.LOW,
+        "medium": ReasoningEffort.MEDIUM,
+        "high": ReasoningEffort.HIGH,
+    }
+    effort = effort_map.get(reasoning_effort.lower(), ReasoningEffort.HIGH)
+
+    # Create system message with channels
+    system_content = (
+        SystemContent.new()
+        .with_model_identity(messages[0]["content"] if messages else SYSTEM_DEF)
+        .with_reasoning_effort(effort)
+        .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
+        .with_knowledge_cutoff("2025-01")
+        .with_required_channels(REQUIRED_CHANNELS)
+    )
+
+    # Build conversation
+    harmony_messages = [
+        Message.from_role_and_content(Role.SYSTEM, system_content)
+    ]
+
+    # Add user/assistant messages
+    for msg in messages[1:]:  # Skip system message as we already added it
+        if msg["role"] == "user":
+            harmony_messages.append(
+                Message.from_role_and_content(Role.USER, msg["content"])
+            )
+        elif msg["role"] == "assistant":
+            # For assistant messages, we might want to preserve channels if they exist
+            harmony_messages.append(
+                Message.from_role_and_content(Role.ASSISTANT, msg["content"])
+                .with_channel("final")  # Default to final channel
+            )
+
+    # Create conversation and render
+    convo = Conversation.from_messages(harmony_messages)
+    tokens = harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
+
+    # Convert tokens back to text for the model
+    return tokenizer.decode(tokens)
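+# Added note (not in the original commit): decoded back to text, the rendered Harmony
+# prompt has roughly the shape sketched below; the exact special tokens come from
+# openai_harmony, so treat this as an illustrative sketch rather than the canonical form.
+#   <|start|>system<|message|>...identity, reasoning effort, required channels...<|end|>
+#   <|start|>user<|message|>Hello!<|end|>
+#   <|start|>assistant
+# The model then continues with channel-tagged segments such as
+# <|channel|>analysis<|message|>...<|end|> before <|channel|>final<|message|>...<|return|>.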
+
+def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
+    """Parse response tokens using Harmony format to extract channels."""
+    if not HARMONY_AVAILABLE:
+        # Fallback: just decode and extract final channel manually
+        text = tokenizer.decode(tokens, skip_special_tokens=False)
+        return {"final": extract_final_channel_fallback(text), "raw": text}
+
+    # Parse messages from completion tokens
+    parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)
+
+    # Extract content by channel
+    channels = {}
+    for msg in parsed_messages:
+        channel = msg.channel if hasattr(msg, 'channel') else "final"
+        if channel not in channels:
+            channels[channel] = ""
+        channels[channel] += msg.content
+
+    # Ensure we have a final channel
+    if "final" not in channels:
+        channels["final"] = " ".join(channels.values())
+
+    return channels
+
+def extract_final_channel_fallback(text: str) -> str:
+    """Fallback extraction when harmony library isn't available."""
+    # Look for the final channel marker
     final_marker = "<|channel|>final<|message|>"
+
     if final_marker in text:
         parts = text.split(final_marker)
         if len(parts) > 1:
             final_text = parts[-1]
+
+            # Clean up end markers
+            end_markers = ["<|return|>", "<|end|>", "<|endoftext|>"]
+            for marker in end_markers:
                 if marker in final_text:
                     final_text = final_text.split(marker)[0]
+
             return final_text.strip()

+    # If no channel markers found, return cleaned text
     return text.strip()
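+# Added example (not in the original commit): given raw model text such as
+#   "<|channel|>analysis<|message|>reasoning...<|end|><|channel|>final<|message|>Paris.<|return|>"
+# extract_final_channel_fallback() returns "Paris.", i.e. everything after the final-channel
+# marker truncated at the first end marker; without any markers the whole text is returned.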

 # -----------------------
+# Rose guidance
 # -----------------------
+
+def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
+    """Create vocab bias from {token: weight}."""
+    vocab_size = len(tokenizer)
+    bias = torch.zeros(vocab_size, dtype=torch.float32)
+    for tok, w in mapping.items():
+        if tok is None:
+            continue
+        tid = tokenizer.convert_tokens_to_ids(tok)
+        if isinstance(tid, list):
+            for t in tid:
+                if isinstance(t, int) and t >= 0:
+                    bias[t] += float(w) / max(1, len(tid))
+        elif isinstance(tid, int) and tid >= 0:
+            bias[tid] += float(w)
+    return bias
+
+class RoseGuidedLogits(torch.nn.Module):
+    def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
+        super().__init__()
+        self.bias_vec = bias_vec
+        self.alpha = float(alpha)
+
+    def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        return scores + self.alpha * self.bias_vec.to(scores.device)
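+
+# Added usage sketch (not in the original commit; the token strings are placeholders):
+# a Rose map such as {"sunrise": 1.5, "static": -0.75} becomes a vocab-sized bias that is
+# added to the logits at every decoding step, with alpha scaling the whole vector.
+#   _bias = build_bias_from_tokens(tokenizer, {"sunrise": 1.5, "static": -0.75})
+#   _processors = [RoseGuidedLogits(_bias, alpha=0.8)]
+#   # passed to model.generate(..., logits_processor=_processors) in zerogpu_generate below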
+
+@spaces.GPU(duration=120)
+def zerogpu_generate(full_prompt: str,
+                     gen_kwargs: Dict[str, Any],
+                     rose_map: Optional[Dict[str, float]],
+                     rose_alpha: float,
+                     rose_score: Optional[float],
+                     seed: Optional[int]) -> Dict[str, str]:
+    """Run inference on GPU and return parsed channels."""
     try:
+        if seed is not None:
+            torch.manual_seed(int(seed))
+
+        # Load model
+        model = _load_model_on("auto")

+        # Setup logits processor for Rose guidance
+        logits_processor = None
+        if rose_map:
+            bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
+            eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
+            logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
+
+        # Tokenize input
+        inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)

         # Generate
+        out_ids = model.generate(
+            **inputs,
+            do_sample=bool(gen_kwargs.get("do_sample", True)),
+            temperature=float(gen_kwargs.get("temperature", 0.7)),
+            top_p=float(gen_kwargs.get("top_p", 0.9)),
+            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+            max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            logits_processor=logits_processor,
+        )

+        # Extract generated tokens only
+        prompt_len = int(inputs["input_ids"].shape[1])
+        gen_ids = out_ids[0][prompt_len:].tolist()

+        # Parse response with Harmony
+        if HARMONY_AVAILABLE:
+            channels = parse_harmony_response(gen_ids)
+        else:
+            # Fallback
+            decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+            channels = {
+                "final": extract_final_channel_fallback(decoded),
+                "raw": decoded
+            }

+        return channels
+
     except Exception as e:
+        return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": str(e)}
     finally:
         # Cleanup
+        try:
+            del model
+        except Exception:  # model may be unbound if loading failed
+            pass
+        gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
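+        # Added note (not in the original commit): under ZeroGPU, @spaces.GPU only attaches a
+        # GPU for the duration of this call (120 s requested here), so the model is loaded
+        # inside the function and released in this finally block instead of being cached at
+        # module import time.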
 
 
 
 
 

 # -----------------------
+# Gradio handlers
 # -----------------------
+
+def generate_response(message: str, history: List[List[str]], system_prompt: str,
+                      temperature: float, top_p: float, top_k: int, max_new_tokens: int,
+                      do_sample: bool, seed: Optional[int],
+                      rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
+                      rose_tokens: str, rose_json: str,
+                      show_thinking: bool = False,
+                      reasoning_effort: str = "high") -> str:
+    """
+    Generate response with proper CoT handling using Harmony format.
+    """
     try:
+        # Build message list
+        messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]

+        # Add history
+        if history:
+            for turn in history:
+                if isinstance(turn, (list, tuple)) and len(turn) >= 2:
+                    user_msg, assistant_msg = turn[0], turn[1]
+                    if user_msg:
+                        messages.append({"role": "user", "content": str(user_msg)})
+                    if assistant_msg:
+                        messages.append({"role": "assistant", "content": str(assistant_msg)})

+        # Add current message
+        messages.append({"role": "user", "content": str(message)})

+        # Create Harmony-formatted prompt
+        if HARMONY_AVAILABLE:
+            prompt = create_harmony_prompt(messages, reasoning_effort)
+        else:
+            # Fallback to tokenizer template
+            prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+
+        # Build Rose map if enabled
+        rose_map: Optional[Dict[str, float]] = None
+        if rose_enable:
+            rose_map = {}
+            tok_str = (rose_tokens or "").strip()
+            if tok_str:
+                for p in [p.strip() for p in tok_str.split(",") if p.strip()]:
+                    if ":" in p:
+                        k, v = p.split(":", 1)
+                        try:
+                            rose_map[k.strip()] = float(v)
+                        except (TypeError, ValueError):
+                            pass
+            if rose_json:
+                try:
+                    j = json.loads(rose_json)
+                    if isinstance(j, dict):
+                        for k, v in j.items():
+                            try:
+                                rose_map[str(k)] = float(v)
+                            except (TypeError, ValueError):
+                                pass
+                except Exception:
+                    pass
+            if not rose_map:
+                rose_map = None
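+            # Added example (not in the original commit; token names are placeholders): with
+            # rose_enable=True, rose_tokens="calm:1.2, chaos:-0.8" and rose_json='{"focus": 0.5}'
+            # merge into rose_map={"calm": 1.2, "chaos": -0.8, "focus": 0.5}; invalid entries
+            # are skipped, and an empty map falls back to None (no Rose guidance).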
+
+        # Generate with model
+        channels = zerogpu_generate(
+            prompt,
+            {
+                "do_sample": bool(do_sample),
+                "temperature": float(temperature),
+                "top_p": float(top_p),
+                "top_k": int(top_k) if top_k > 0 else None,
+                "max_new_tokens": int(max_new_tokens),
+            },
+            rose_map,
+            float(rose_alpha),
+            float(rose_score) if rose_score is not None else None,
+            int(seed) if seed is not None else None,
         )

+        # Format response
         if show_thinking:
+            # Show all channels
+            response = "## Chain of Thought:\n\n"
+            for channel, content in channels.items():
+                if channel != "final" and content:
+                    response += f"### {channel.capitalize()} Channel:\n{content}\n\n"
+            response += f"### Final Response:\n{channels.get('final', 'No final response generated')}"
+            return response
         else:
+            # Just show the final response
+            return channels.get("final", "No final response generated")

     except Exception as e:
+        return f"[Error] {type(e).__name__}: {str(e)}"

 # -----------------------
+# UI
 # -----------------------
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # Mirel – Harmony Chain-of-Thought Inference
+
+        OSS-20B model using Harmony format with thinking channels.
+        The model thinks through problems in internal channels before providing a final response.
+
+        **Note:** Install `openai-harmony` for full Harmony support: `pip install openai-harmony`
+        """
+    )
+
+    with gr.Row():
+        system_prompt = gr.Textbox(
+            label="System Prompt",
+            value=SYSTEM_DEF,
+            lines=2
+        )
+
+    with gr.Accordion("Generation Settings", open=False):
+        with gr.Row():
+            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
+            top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k (0=disabled)")
+        with gr.Row():
+            max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
+            do_sample = gr.Checkbox(value=True, label="Do sample")
+            seed = gr.Number(value=None, label="Seed (optional)", precision=0)
+        with gr.Row():
+            reasoning_effort = gr.Radio(
+                choices=["low", "medium", "high"],
+                value="high",
+                label="Reasoning Effort",
+                info="How much thinking the model should do"
             )
+            show_thinking = gr.Checkbox(
+                value=False,
+                label="Show thinking channels",
+                info="Display all internal reasoning channels"
+            )
+
+    with gr.Accordion("Rose Guidance (Optional)", open=False):
+        gr.Markdown("Fine-tune generation with token biases")
+        with gr.Row():
+            rose_enable = gr.Checkbox(value=False, label="Enable Rose bias")
+            rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
+            rose_score = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Score multiplier")
+        rose_tokens = gr.Textbox(
+            label="Token:weight pairs",
+            placeholder="example:1.5, test:-0.5",
+            value=""
         )
+        rose_json = gr.Textbox(
+            label="JSON weights",
+            placeholder='{"token": 1.0, "another": -0.5}',
+            value=""
         )
+
+    # Chat interface - using only valid parameters
+    chat = gr.ChatInterface(
+        fn=generate_response,
+        additional_inputs=[
+            system_prompt, temperature, top_p, top_k, max_new,
+            do_sample, seed, rose_enable, rose_alpha, rose_score,
+            rose_tokens, rose_json, show_thinking, reasoning_effort
+        ],
+        title="Chat with Mirel",
+        description="A chain-of-thought model using Harmony format",
+        examples=[
+            ["Hello! Can you introduce yourself?"],
+            ["What is the capital of France?"],
+            ["Explain quantum computing in simple terms"],
+            ["Solve: If a train travels 120 miles in 2 hours, what is its average speed?"],
+        ],
+        cache_examples=False,
+    )
+
+    gr.Markdown(
+        """
+        ---
+        ### Configuration:
+        - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
+        - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER`
+        - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
+        - **Harmony**: Install with `pip install openai-harmony` for proper channel support

+        The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
+        """
+    )

 if __name__ == "__main__":
+    demo.queue(max_size=8 if ZEROGPU else 32).launch(
+        server_name="0.0.0.0",
         server_port=7860,
+        share=True
  )
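+# Added local-run sketch (not in the original commit; the token value, adapter repo, and
+# subfolder are placeholders, and a CUDA GPU is assumed when ZeroGPU is not used):
+#   HF_TOKEN=hf_xxx MODEL_ID=openai/gpt-oss-20b DTYPE=bf16 MAX_NEW_TOKENS=1024 python app.py
+#   ADAPTER_ID=<peft-repo> ADAPTER_SUBFOLDER=<subfolder> python app.py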