AbstractPhil committed
Commit bbabb73 · 1 Parent(s): 40292d5

finally got claude to add harmony format

Files changed (2):
  1. app.py +277 -253
  2. requirements.txt +2 -1

app.py CHANGED
@@ -1,17 +1,36 @@
  """
  Mirel Harmony Inference – HF Space (Gradio)
  ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
- Chain-of-thought model with proper channel extraction
  Single file: app.py
  """
  from __future__ import annotations
- import os, gc, json, threading, torch, traceback
  from dataclasses import dataclass
- from typing import List, Dict, Optional, Any, Iterator
  import gradio as gr
  import spaces  # required for ZeroGPU
  from transformers import AutoTokenizer, AutoModelForCausalLM

  # -----------------------
  # Config & runtime modes
  # -----------------------
@@ -23,11 +42,14 @@ ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
  DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
  SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
- MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "512"))
- ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "1")) == "1"
  LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"

- # HF Auth - check for token in environment or use OAuth
  HF_TOKEN: Optional[str] = (
      os.getenv("HF_TOKEN")
      or os.getenv("HUGGING_FACE_HUB_TOKEN")
@@ -35,20 +57,34 @@ HF_TOKEN: Optional[str] = (
      or os.getenv("HF_ACCESS_TOKEN")
  )

- # For private model access via token (if not using OAuth)
- if HF_TOKEN:
-     try:
-         from huggingface_hub import login
-         login(token=HF_TOKEN, add_to_git_credential=True)
-         print(f"[HF Auth] Using token from environment")
-     except Exception as e:
-         print(f"[HF Auth] Token login failed: {e}")
- else:
-     print("[HF Auth] No token in environment - OAuth will be available in UI")

  os.environ["TOKENIZERS_PARALLELISM"] = "false"

- # Tokenizer is lightweight; load once (pass token for private models)
  try:
      tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
      print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
@@ -57,11 +93,8 @@ except Exception as e:
      raise

  # -----------------------
- # Lazy model loader (ZeroGPU-friendly)
  # -----------------------
- _model: Optional[AutoModelForCausalLM] = None
- _model_lock = threading.Lock()
-
  try:
      from peft import PeftModel
      _HAS_PEFT = True
@@ -76,12 +109,11 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
          attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
          trust_remote_code=True,
          low_cpu_mem_usage=True,
-         token=HF_TOKEN,  # Add token here for private model access
      )
-     # Only enable 4-bit when not explicitly CPU-bound
      if LOAD_4BIT and device_map != "cpu":
          try:
-             import bitsandbytes as _bnb  # noqa: F401
              kw.update(load_in_4bit=True)
              if kw["device_map"] is None:
                  kw["device_map"] = "auto"
@@ -109,30 +141,88 @@ def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
      return model

  # -----------------------
- # Harmony formatting & CoT extraction
  # -----------------------

- def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
-     """
-     Strict Harmony: rely on the tokenizer's official chat template.
-     """
-     tmpl = getattr(tokenizer, "chat_template", None)
-     if not tmpl:
-         raise RuntimeError(
-             "Missing Harmony chat_template on this tokenizer. Use a Harmony-enabled repo (e.g., openai/gpt-oss-20b)."
-         )
-     return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

- def extract_final_channel(text: str) -> str:
-     """
-     Extract the final channel from chain-of-thought output.
-     The model outputs thinking in internal channels and final response in final channel.
-     """
      # Look for the final channel marker
      final_marker = "<|channel|>final<|message|>"

      if final_marker in text:
-         # Extract everything after the final channel marker
          parts = text.split(final_marker)
          if len(parts) > 1:
              final_text = parts[-1]
@@ -145,16 +235,15 @@ def extract_final_channel(text: str) -> str:

              return final_text.strip()

-     # If no channel markers found, return the cleaned text
-     # (might be a non-CoT response or error)
      return text.strip()

  # -----------------------
- # Optional Rose guidance (logits bias)
  # -----------------------

  def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
-     """Create vocab bias from {token: weight}. Unknown tokens ignored."""
      vocab_size = len(tokenizer)
      bias = torch.zeros(vocab_size, dtype=torch.float32)
      for tok, w in mapping.items():
@@ -178,168 +267,110 @@ class RoseGuidedLogits(torch.nn.Module):
      def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
          return scores + self.alpha * self.bias_vec.to(scores.device)

- # Use appropriate decorator based on whether ZeroGPU is enabled
- if ZEROGPU:
-     @spaces.GPU(duration=120)
-     def zerogpu_generate(full_prompt: str,
-                          gen_kwargs: Dict[str, Any],
-                          rose_map: Optional[Dict[str, float]],
-                          rose_alpha: float,
-                          rose_score: Optional[float],
-                          seed: Optional[int]) -> str:
-         """Run inference on GPU (ZeroGPU-safe)."""
-         try:
-             if seed is not None:
-                 torch.manual_seed(int(seed))
-
-             # Load model
-             model = _load_model_on("auto")
-
-             # Setup logits processor for Rose guidance
-             logits_processor = None
-             if rose_map:
-                 bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
-                 eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
-                 logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
-
-             # Tokenize input
-             inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
-
-             # Non-streaming generation
-             out_ids = model.generate(
-                 **inputs,
-                 do_sample=bool(gen_kwargs.get("do_sample", True)),
-                 temperature=float(gen_kwargs.get("temperature", 0.7)),
-                 top_p=float(gen_kwargs.get("top_p", 0.9)),
-                 top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
-                 max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-                 pad_token_id=tokenizer.eos_token_id,
-                 eos_token_id=tokenizer.eos_token_id,
-                 logits_processor=logits_processor,
-             )
-
-             # Decode the full output (including special tokens for CoT)
-             prompt_len = int(inputs["input_ids"].shape[1])
-             gen_ids = out_ids[0][prompt_len:]
              decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)

-             return decoded
-
-         except Exception as e:
-             error_msg = f"Generation failed: {str(e)}"
-             print(f"[Error] {error_msg}")
-             print(traceback.format_exc())
-             return error_msg
-         finally:
-             # Cleanup
-             try:
-                 del model
-             except:
-                 pass
-             gc.collect()
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()
- else:
-     def zerogpu_generate(full_prompt: str,
-                          gen_kwargs: Dict[str, Any],
-                          rose_map: Optional[Dict[str, float]],
-                          rose_alpha: float,
-                          rose_score: Optional[float],
-                          seed: Optional[int]) -> str:
-         """Run inference without ZeroGPU decorator."""
-         # Same implementation as above but without the decorator
          try:
-             if seed is not None:
-                 torch.manual_seed(int(seed))
-
-             # Load model
-             model = _load_model_on("auto" if torch.cuda.is_available() else "cpu")
-
-             # Setup logits processor for Rose guidance
-             logits_processor = None
-             if rose_map:
-                 bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
-                 eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
-                 logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
-
-             # Tokenize input
-             inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
-
-             # Non-streaming generation
-             out_ids = model.generate(
-                 **inputs,
-                 do_sample=bool(gen_kwargs.get("do_sample", True)),
-                 temperature=float(gen_kwargs.get("temperature", 0.7)),
-                 top_p=float(gen_kwargs.get("top_p", 0.9)),
-                 top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
-                 max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-                 pad_token_id=tokenizer.eos_token_id,
-                 eos_token_id=tokenizer.eos_token_id,
-                 logits_processor=logits_processor,
-             )
-
-             # Decode the full output (including special tokens for CoT)
-             prompt_len = int(inputs["input_ids"].shape[1])
-             gen_ids = out_ids[0][prompt_len:]
-             decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
-
-             return decoded
-
-         except Exception as e:
-             error_msg = f"Generation failed: {str(e)}"
-             print(f"[Error] {error_msg}")
-             print(traceback.format_exc())
-             return error_msg
-         finally:
-             # Cleanup
-             try:
-                 del model
-             except:
-                 pass
-             gc.collect()
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()

  # -----------------------
  # Gradio handlers
  # -----------------------

- def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, str]]:
-     msgs: List[Dict[str, str]] = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
-     for item in history:
-         if not item:
-             continue
-         if isinstance(item, dict) and "role" in item:
-             msgs.append(item)
-             continue
-         if isinstance(item, (list, tuple)) and len(item) == 2:
-             u, a = item
-             if u is not None:
-                 msgs.append({"role": "user", "content": str(u)})
-             if a:
-                 msgs.append({"role": "assistant", "content": str(a)})
-     return msgs
-
- def generate_response(message: Any, history: List[Any], system_prompt: str,
-                       temperature: float, top_p: float, top_k: int, max_new_tokens: int,
-                       do_sample: bool, seed: Optional[int],
-                       rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
-                       rose_tokens: str, rose_json: str,
-                       show_thinking: bool = False) -> str:
      """
-     Non-streaming response generator for ChatInterface.
-     Returns a complete response to avoid h11 Content-Length issues.
      """
      try:
-         # Normalize message and build Harmony prompt
-         if isinstance(message, dict):
-             message = message.get("content", "")

-         msgs = chat_to_messages(history, system_prompt)
-         msgs.append({"role": "user", "content": str(message)})

-         prompt = to_harmony_prompt(msgs)

          # Build Rose map if enabled
          rose_map: Optional[Dict[str, float]] = None
@@ -367,9 +398,9 @@ def generate_response(message: Any, history: List[Any], system_prompt: str,
                  pass
          if not rose_map:
              rose_map = None
-
          # Generate with model
-         full_output = zerogpu_generate(
              prompt,
              {
                  "do_sample": bool(do_sample),
@@ -384,57 +415,43 @@ def generate_response(message: Any, history: List[Any], system_prompt: str,
              int(seed) if seed is not None else None,
          )

-         # Extract final response from CoT output
          if show_thinking:
-             # Show the full chain-of-thought process
-             return f"**Full Output (with thinking):**\n```\n{full_output}\n```\n\n**Final Response:**\n{extract_final_channel(full_output)}"
          else:
              # Just show the final response
-             return extract_final_channel(full_output)

      except Exception as e:
-         error_msg = f"⚠️ Error: {str(e)}"
-         print(f"[Error in generate_response] {error_msg}")
-         print(traceback.format_exc())
-         return error_msg

  # -----------------------
  # UI
  # -----------------------
- css = """
- #chatbot {
-     height: 500px;
- }
- .gradio-container {
-     max-width: 1200px !important;
- }
- """
-
- with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
      gr.Markdown(
          """
-         # Mirel – Harmony Inference (ZeroGPU-ready)

-         Chain-of-thought OSS-20B model with Harmony formatting.
-         The model thinks through problems internally before providing a final response.

-         **Auth:** Set `HF_TOKEN` in Space secrets or add `hf_oauth: true` to README for browser auth.
          """
      )

      with gr.Row():
-         with gr.Column(scale=3):
-             system_prompt = gr.Textbox(
-                 label="System Prompt",
-                 value=SYSTEM_DEF,
-                 lines=2
-             )
-         with gr.Column(scale=1):
-             show_thinking = gr.Checkbox(
-                 value=False,
-                 label="Show thinking process",
-                 info="Display internal CoT reasoning"
-             )

      with gr.Accordion("Generation Settings", open=False):
          with gr.Row():
@@ -445,6 +462,18 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
              max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
              do_sample = gr.Checkbox(value=True, label="Do sample")
              seed = gr.Number(value=None, label="Seed (optional)", precision=0)

      with gr.Accordion("Rose Guidance (Optional)", open=False):
          gr.Markdown("Fine-tune generation with token biases")
@@ -466,46 +495,41 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
      # Chat interface
      chat = gr.ChatInterface(
          fn=generate_response,
-         chatbot=gr.Chatbot(elem_id="chatbot", height=500, type="messages"),
          additional_inputs=[
              system_prompt, temperature, top_p, top_k, max_new,
              do_sample, seed, rose_enable, rose_alpha, rose_score,
-             rose_tokens, rose_json, show_thinking
          ],
-         title=None,  # Title already in markdown
-         description=None,  # Description already in markdown
          cache_examples=False,
      )

      gr.Markdown(
          """
          ---
-         ### Configuration Notes:
-
-         **Authentication Options:**
-         1. **Browser OAuth**: Click "Sign in with Hugging Face" above (easiest)
-         2. **Environment Token**: Set `HF_TOKEN` in Space secrets
-         3. **No Auth**: Works for public models only
-
-         **Important:** For OAuth to work in Spaces, add `hf_oauth: true` to your README.md metadata
-
-         **Other Settings:**
          - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
-         - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER` for PEFT adapters
-         - **ZeroGPU**: Set `ZEROGPU=1` for Spaces with ZeroGPU (default: enabled)
-         - **4-bit**: Set `LOAD_4BIT=1` to enable 4-bit quantization

-         The model uses internal "thinking" channels before producing a final response.
-         Enable "Show thinking process" to see the full chain-of-thought.
          """
      )

  if __name__ == "__main__":
-     # Simple queue configuration
-     demo.queue(
-         max_size=10,
-     ).launch(
          server_name="0.0.0.0",
          server_port=7860,
-         share=False,
      )

  """
  Mirel Harmony Inference – HF Space (Gradio)
  ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
+ Chain-of-thought model with proper channel extraction using openai_harmony
  Single file: app.py
  """
  from __future__ import annotations
+ import os, gc, json, threading, torch
  from dataclasses import dataclass
+ from typing import List, Dict, Optional, Any
+ from datetime import datetime
  import gradio as gr
  import spaces  # required for ZeroGPU
  from transformers import AutoTokenizer, AutoModelForCausalLM

+ # Import Harmony components
+ try:
+     from openai_harmony import (
+         Author,
+         Conversation,
+         HarmonyEncodingName,
+         Message,
+         Role,
+         SystemContent,
+         DeveloperContent,
+         load_harmony_encoding,
+         ReasoningEffort
+     )
+     HARMONY_AVAILABLE = True
+ except ImportError:
+     print("[WARNING] openai_harmony not installed. Install with: pip install openai-harmony")
+     HARMONY_AVAILABLE = False
+
  # -----------------------
  # Config & runtime modes
  # -----------------------
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
  DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
  SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
+ MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "1024"))
+ ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
  LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"

+ # Harmony channels for CoT
+ REQUIRED_CHANNELS = ["thinking", "analysis", "final"]
+
+ # HF Auth - properly handle multiple token env var names
  HF_TOKEN: Optional[str] = (
      os.getenv("HF_TOKEN")
      or os.getenv("HUGGING_FACE_HUB_TOKEN")
      or os.getenv("HF_ACCESS_TOKEN")
  )

+ def _hf_login() -> None:
+     """Login to HF Hub using common env secret names."""
+     if HF_TOKEN:
+         try:
+             from huggingface_hub import login, whoami
+             login(token=HF_TOKEN, add_to_git_credential=True)
+             try:
+                 who = whoami(token=HF_TOKEN)
+                 print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
+             except Exception:
+                 print("[HF Auth] Login successful but couldn't get user info")
+         except Exception as e:
+             print(f"[HF Auth] Login failed: {e}")
+     else:
+         print("[HF Auth] No token found in environment variables")
+
+ # Login before loading any models
+ _hf_login()
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

+ # Load Harmony encoding if available
+ if HARMONY_AVAILABLE:
+     harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+ else:
+     harmony_encoding = None
+
+ # Tokenizer is lightweight; load once
  try:
      tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
      print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")

      raise

  # -----------------------
+ # Model loading
  # -----------------------
  try:
      from peft import PeftModel
      _HAS_PEFT = True

          attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
          trust_remote_code=True,
          low_cpu_mem_usage=True,
+         token=HF_TOKEN,
      )
      if LOAD_4BIT and device_map != "cpu":
          try:
+             import bitsandbytes as _bnb
              kw.update(load_in_4bit=True)
              if kw["device_map"] is None:
                  kw["device_map"] = "auto"
      return model

  # -----------------------
+ # Harmony formatting
  # -----------------------

+ def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> str:
+     """Create a proper Harmony-formatted prompt using openai_harmony."""
+     if not HARMONY_AVAILABLE:
+         # Fallback to tokenizer's chat template
+         return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+
+     # Map reasoning effort
+     effort_map = {
+         "low": ReasoningEffort.LOW,
+         "medium": ReasoningEffort.MEDIUM,
+         "high": ReasoningEffort.HIGH,
+     }
+     effort = effort_map.get(reasoning_effort.lower(), ReasoningEffort.HIGH)
+
+     # Create system message with channels
+     system_content = (
+         SystemContent.new()
+         .with_model_identity(messages[0]["content"] if messages else SYSTEM_DEF)
+         .with_reasoning_effort(effort)
+         .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
+         .with_knowledge_cutoff("2025-01")
+         .with_required_channels(REQUIRED_CHANNELS)
+     )
+
+     # Build conversation
+     harmony_messages = [
+         Message.from_role_and_content(Role.SYSTEM, system_content)
+     ]
+
+     # Add user/assistant messages
+     for msg in messages[1:]:  # Skip system message as we already added it
+         if msg["role"] == "user":
+             harmony_messages.append(
+                 Message.from_role_and_content(Role.USER, msg["content"])
+             )
+         elif msg["role"] == "assistant":
+             # For assistant messages, we might want to preserve channels if they exist
+             harmony_messages.append(
+                 Message.from_role_and_content(Role.ASSISTANT, msg["content"])
+                 .with_channel("final")  # Default to final channel
+             )
+
+     # Create conversation and render
+     convo = Conversation.from_messages(harmony_messages)
+     tokens = harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
+
+     # Convert tokens back to text for the model
+     return tokenizer.decode(tokens)
+
+ def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
+     """Parse response tokens using Harmony format to extract channels."""
+     if not HARMONY_AVAILABLE:
+         # Fallback: just decode and extract final channel manually
+         text = tokenizer.decode(tokens, skip_special_tokens=False)
+         return {"final": extract_final_channel_fallback(text), "raw": text}
+
+     # Parse messages from completion tokens
+     parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)
+
+     # Extract content by channel
+     channels = {}
+     for msg in parsed_messages:
+         channel = msg.channel if hasattr(msg, 'channel') else "final"
+         if channel not in channels:
+             channels[channel] = ""
+         channels[channel] += msg.content
+
+     # Ensure we have a final channel
+     if "final" not in channels:
+         channels["final"] = " ".join(channels.values())
+
+     return channels
+ def extract_final_channel_fallback(text: str) -> str:
221
+ """Fallback extraction when harmony library isn't available."""
 
 
 
222
  # Look for the final channel marker
223
  final_marker = "<|channel|>final<|message|>"
224
 
225
  if final_marker in text:
 
226
  parts = text.split(final_marker)
227
  if len(parts) > 1:
228
  final_text = parts[-1]
 
235
 
236
  return final_text.strip()
237
 
238
+ # If no channel markers found, return cleaned text
 
239
  return text.strip()
240
 
241
  # -----------------------
242
+ # Rose guidance
243
  # -----------------------
244
 
245
  def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
246
+ """Create vocab bias from {token: weight}."""
247
  vocab_size = len(tokenizer)
248
  bias = torch.zeros(vocab_size, dtype=torch.float32)
249
  for tok, w in mapping.items():
 
267
  def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
268
  return scores + self.alpha * self.bias_vec.to(scores.device)
269
 
+ @spaces.GPU(duration=120)
+ def zerogpu_generate(full_prompt: str,
+                      gen_kwargs: Dict[str, Any],
+                      rose_map: Optional[Dict[str, float]],
+                      rose_alpha: float,
+                      rose_score: Optional[float],
+                      seed: Optional[int]) -> Dict[str, str]:
+     """Run inference on GPU and return parsed channels."""
+     try:
+         if seed is not None:
+             torch.manual_seed(int(seed))
+
+         # Load model
+         model = _load_model_on("auto")
+
+         # Setup logits processor for Rose guidance
+         logits_processor = None
+         if rose_map:
+             bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
+             eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
+             logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
+
+         # Tokenize input
+         inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
+
+         # Generate
+         out_ids = model.generate(
+             **inputs,
+             do_sample=bool(gen_kwargs.get("do_sample", True)),
+             temperature=float(gen_kwargs.get("temperature", 0.7)),
+             top_p=float(gen_kwargs.get("top_p", 0.9)),
+             top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+             pad_token_id=tokenizer.eos_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+             logits_processor=logits_processor,
+         )
+
+         # Extract generated tokens only
+         prompt_len = int(inputs["input_ids"].shape[1])
+         gen_ids = out_ids[0][prompt_len:].tolist()
+
+         # Parse response with Harmony
+         if HARMONY_AVAILABLE:
+             channels = parse_harmony_response(gen_ids)
+         else:
+             # Fallback
              decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+             channels = {
+                 "final": extract_final_channel_fallback(decoded),
+                 "raw": decoded
+             }
+
+         return channels

+     except Exception as e:
+         return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": str(e)}
+     finally:
+         # Cleanup
          try:
+             del model
+         except:
+             pass
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()

  # -----------------------
  # Gradio handlers
  # -----------------------

+ def generate_response(message: str, history: List[List[str]], system_prompt: str,
+                       temperature: float, top_p: float, top_k: int, max_new_tokens: int,
+                       do_sample: bool, seed: Optional[int],
+                       rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
+                       rose_tokens: str, rose_json: str,
+                       show_thinking: bool = False,
+                       reasoning_effort: str = "high") -> str:
      """
+     Generate response with proper CoT handling using Harmony format.
      """
      try:
+         # Build message list
+         messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]

+         # Add history
+         if history:
+             for turn in history:
+                 if isinstance(turn, (list, tuple)) and len(turn) >= 2:
+                     user_msg, assistant_msg = turn[0], turn[1]
+                     if user_msg:
+                         messages.append({"role": "user", "content": str(user_msg)})
+                     if assistant_msg:
+                         messages.append({"role": "assistant", "content": str(assistant_msg)})

+         # Add current message
+         messages.append({"role": "user", "content": str(message)})
+
+         # Create Harmony-formatted prompt
+         if HARMONY_AVAILABLE:
+             prompt = create_harmony_prompt(messages, reasoning_effort)
+         else:
+             # Fallback to tokenizer template
+             prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

          # Build Rose map if enabled
          rose_map: Optional[Dict[str, float]] = None

                  pass
          if not rose_map:
              rose_map = None
+
          # Generate with model
+         channels = zerogpu_generate(
              prompt,
              {
                  "do_sample": bool(do_sample),
              int(seed) if seed is not None else None,
          )

+         # Format response
          if show_thinking:
+             # Show all channels
+             response = "## Chain of Thought:\n\n"
+             for channel, content in channels.items():
+                 if channel != "final" and content:
+                     response += f"### {channel.capitalize()} Channel:\n{content}\n\n"
+             response += f"### Final Response:\n{channels.get('final', 'No final response generated')}"
+             return response
          else:
              # Just show the final response
+             return channels.get("final", "No final response generated")

      except Exception as e:
+         return f"[Error] {type(e).__name__}: {str(e)}"

  # -----------------------
  # UI
  # -----------------------
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
+         # Mirel – Harmony Chain-of-Thought Inference

+         OSS-20B model using Harmony format with thinking channels.
+         The model thinks through problems in internal channels before providing a final response.

+         **Note:** Install `openai-harmony` for full Harmony support: `pip install openai-harmony`
          """
      )

      with gr.Row():
+         system_prompt = gr.Textbox(
+             label="System Prompt",
+             value=SYSTEM_DEF,
+             lines=2
+         )

      with gr.Accordion("Generation Settings", open=False):
          with gr.Row():

              max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
              do_sample = gr.Checkbox(value=True, label="Do sample")
              seed = gr.Number(value=None, label="Seed (optional)", precision=0)
+         with gr.Row():
+             reasoning_effort = gr.Radio(
+                 choices=["low", "medium", "high"],
+                 value="high",
+                 label="Reasoning Effort",
+                 info="How much thinking the model should do"
+             )
+             show_thinking = gr.Checkbox(
+                 value=False,
+                 label="Show thinking channels",
+                 info="Display all internal reasoning channels"
+             )

      with gr.Accordion("Rose Guidance (Optional)", open=False):
          gr.Markdown("Fine-tune generation with token biases")

      # Chat interface
      chat = gr.ChatInterface(
          fn=generate_response,
          additional_inputs=[
              system_prompt, temperature, top_p, top_k, max_new,
              do_sample, seed, rose_enable, rose_alpha, rose_score,
+             rose_tokens, rose_json, show_thinking, reasoning_effort
+         ],
+         title="Chat with Mirel",
+         description="A chain-of-thought model using Harmony format",
+         examples=[
+             ["Hello! Can you introduce yourself?"],
+             ["What is the capital of France?"],
+             ["Explain quantum computing in simple terms"],
+             ["Solve: If a train travels 120 miles in 2 hours, what is its average speed?"],
          ],
          cache_examples=False,
+         retry_btn="Retry",
+         undo_btn="Undo",
+         clear_btn="Clear",
      )

      gr.Markdown(
          """
          ---
+         ### Configuration:
          - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
+         - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER`
+         - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
+         - **Harmony**: Install with `pip install openai-harmony` for proper channel support

+         The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
          """
      )

  if __name__ == "__main__":
+     demo.queue(max_size=8 if ZEROGPU else 32).launch(
          server_name="0.0.0.0",
          server_port=7860,
+         share=False
      )
requirements.txt CHANGED
@@ -4,4 +4,5 @@ accelerate>=0.33.0
  peft>=0.11.0
  gradio>=5.38.0
  torch>=2.4.0  # ZeroGPU-supported (2.3.x is NOT supported)
- bitsandbytes>=0.43.1
+ bitsandbytes>=0.43.1
+ openai_harmony
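
For reference, a minimal sketch (not part of the commit) of the openai_harmony render/parse round-trip that the new `create_harmony_prompt` and `parse_harmony_response` helpers in app.py build on. Model loading and generation are elided, and `completion_tokens` is a hypothetical placeholder for the token IDs a model run would produce:

```python
# Sketch only: render a conversation into Harmony prompt tokens, then parse the
# model's completion tokens back into per-channel assistant messages.
from openai_harmony import (
    Conversation,
    HarmonyEncodingName,
    Message,
    Role,
    SystemContent,
    load_harmony_encoding,
)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

# Render: conversation -> token prefix the model should complete as ASSISTANT
convo = Conversation.from_messages([
    Message.from_role_and_content(Role.SYSTEM, SystemContent.new()),
    Message.from_role_and_content(Role.USER, "What is 2 + 2?"),
])
prompt_tokens = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
print(f"Rendered {len(prompt_tokens)} prompt tokens")

# Parse: only the newly generated token IDs go back through the encoding.
completion_tokens: list[int] = []  # placeholder: would come from model.generate(...)
if completion_tokens:
    for msg in encoding.parse_messages_from_completion_tokens(completion_tokens, Role.ASSISTANT):
        channel = getattr(msg, "channel", "final")  # e.g. "analysis" or "final"
        print(channel, msg.content)
```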