Spaces:

AbstractPhil
/

GPT-OSS-20B-Mirel

Running on Zero

App Files Files Community

AbstractPhil committed 22 days ago

Commit

ec38870

1 Parent(s): 01d8622

refactor smaller

Browse files

Files changed (1) hide show

app.py +301 -457

app.py CHANGED Viewed

@@ -1,532 +1,376 @@
 """
 Mirel Harmony Inference – HF Space (Gradio)
-ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
-Chain-of-thought model with proper channel extraction using openai_harmony
-Single file: app.py
 """
-from __future__ import annotations
-import os, gc, json, threading, torch
-from dataclasses import dataclass
-from typing import List, Dict, Optional, Any
-from datetime import datetime
 import gradio as gr
-import spaces  # required for ZeroGPU
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# Import Harmony components
 try:
-    from openai_harmony import (
-        Author,
-        Conversation,
-        HarmonyEncodingName,
-        Message,
-        Role,
-        SystemContent,
-        DeveloperContent,
-        load_harmony_encoding,
-        ReasoningEffort
-    )
-    HARMONY_AVAILABLE = True
 except ImportError:
-    print("[WARNING] openai_harmony not installed. Install with: pip install openai-harmony")
-    HARMONY_AVAILABLE = False
 # -----------------------
-# Config & runtime modes
 # -----------------------
# Short dtype names accepted in the DTYPE environment variable.
DTYPE_MAP = {
    "bf16": torch.bfloat16,
    "fp16": torch.float16,
    "fp32": torch.float32,
}

# Model and adapter selection; empty adapter values collapse to None.
MODEL_ID = os.environ.get("MODEL_ID", "openai/gpt-oss-20b")
ADAPTER_ID = os.environ.get("ADAPTER_ID") or None
ADAPTER_SUBFOLDER = os.environ.get("ADAPTER_SUBFOLDER") or None

# Runtime knobs. DTYPE is matched case-insensitively; unknown names fall
# back to bfloat16.
ATTN_IMPL = os.environ.get("ATTN_IMPL", "eager")
DTYPE = DTYPE_MAP.get(os.environ.get("DTYPE", "bf16").lower(), torch.bfloat16)
SYSTEM_DEF = os.environ.get(
    "SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant."
)
MAX_DEF = int(os.environ.get("MAX_NEW_TOKENS", "1024"))

# ZEROGPU wins over the ZERO_GPU spelling; both flags are enabled by "1".
ZEROGPU = os.environ.get("ZEROGPU", os.environ.get("ZERO_GPU", "0")) == "1"
LOAD_4BIT = os.environ.get("LOAD_4BIT", "0") == "1"

# Channels passed to the Harmony system message.
REQUIRED_CHANNELS = ["thinking", "analysis", "final"]

# Hub token: the first non-empty value among the supported variable names.
HF_TOKEN: Optional[str] = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HF_ACCESS_TOKEN")
)
def _hf_login() -> None:
    """Log in to the Hugging Face Hub with ``HF_TOKEN``, reporting the outcome.

    Nothing is raised: a missing token, a failed login, and a failed identity
    lookup are each reported with a ``[HF Auth]`` message on stdout.
    """
    if not HF_TOKEN:
        print("[HF Auth] No token found in environment variables")
        return

    try:
        from huggingface_hub import login, whoami
        login(token=HF_TOKEN, add_to_git_credential=True)
    except Exception as e:
        print(f"[HF Auth] Login failed: {e}")
        return

    # The login itself succeeded; the identity lookup is informational only.
    try:
        who = whoami(token=HF_TOKEN)
        print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
    except Exception:
        print("[HF Auth] Login successful but couldn't get user info")
# Authenticate before anything is fetched from the Hub.
_hf_login()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The Harmony encoding only exists when the optional package was importable.
harmony_encoding = (
    load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    if HARMONY_AVAILABLE
    else None
)

# The tokenizer is loaded once at import time; a failure is logged and re-raised.
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        token=HF_TOKEN,
    )
    print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
except Exception as e:
    print(f"[Model] Failed to load tokenizer: {e}")
    raise
 # -----------------------
-# Model loading
 # -----------------------
-try:
-    from peft import PeftModel
-    _HAS_PEFT = True
-except Exception:
-    _HAS_PEFT = False
def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
    """Assemble keyword arguments for ``AutoModelForCausalLM.from_pretrained``.

    Args:
        device_map: Value forwarded as ``device_map``. The string ``"cpu"``
            forces eager attention and disables 4-bit loading.

    Returns:
        A dict with dtype, device map, attention implementation, auth token
        and, when ``LOAD_4BIT`` is set and bitsandbytes imports cleanly,
        ``load_in_4bit=True``.
    """
    on_cpu = device_map == "cpu"
    kw: Dict[str, Any] = dict(
        torch_dtype=DTYPE,
        device_map=device_map,
        attn_implementation="eager" if on_cpu else ATTN_IMPL,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
    )
    if LOAD_4BIT and not on_cpu:
        try:
            import bitsandbytes  # noqa: F401 -- availability probe only
        except Exception as e:
            # Still best-effort, but say so: silently loading an unquantized
            # model is the difference between fitting in memory and not.
            print(f"[Model] LOAD_4BIT requested but bitsandbytes is unavailable ({e}); loading unquantized")
        else:
            kw["load_in_4bit"] = True
            # Quantized loading needs a device map; default to automatic placement.
            if kw["device_map"] is None:
                kw["device_map"] = "auto"
    return kw
def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
    """Load the base model, optionally wrap it with a PEFT adapter, and return it.

    Args:
        device_map: Placement value handed to ``_build_model_kwargs``.

    Returns:
        The model in eval mode with ``config.use_cache`` enabled.

    Raises:
        RuntimeError: If ``ADAPTER_ID`` is set but ``peft`` could not be imported.
    """
    print(f"[Model] Loading base model from {MODEL_ID}...")
    load_kwargs = _build_model_kwargs(device_map)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)

    if ADAPTER_ID:
        if not _HAS_PEFT:
            raise RuntimeError("peft is required when ADAPTER_ID is set.")
        print(f"[Model] Loading adapter from {ADAPTER_ID}...")
        # The subfolder is only forwarded when one was configured.
        subfolder_kw = {"subfolder": ADAPTER_SUBFOLDER} if ADAPTER_SUBFOLDER else {}
        model = PeftModel.from_pretrained(
            model,
            ADAPTER_ID,
            is_trainable=False,
            token=HF_TOKEN,
            **subfolder_kw,
        )

    model.eval()
    model.config.use_cache = True
    print("[Model] Model loaded successfully")
    return model
-# -----------------------
-# Harmony formatting
-# -----------------------
def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> str:
    """Render a chat history as a Harmony-formatted completion prompt.

    Args:
        messages: Chat turns as ``{"role": ..., "content": ...}`` dicts. A
            leading ``system`` turn supplies the model identity; otherwise
            ``SYSTEM_DEF`` is used and every turn is kept as conversation.
        reasoning_effort: ``"low"``, ``"medium"`` or ``"high"`` (any case);
            anything else, including ``None``, means high.

    Returns:
        The prompt text. When ``openai_harmony`` is unavailable, the
        tokenizer's chat template output is returned instead.
    """
    if not HARMONY_AVAILABLE:
        # Fallback to the tokenizer's chat template
        return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

    effort_map = {
        "low": ReasoningEffort.LOW,
        "medium": ReasoningEffort.MEDIUM,
        "high": ReasoningEffort.HIGH,
    }
    effort = effort_map.get((reasoning_effort or "").lower(), ReasoningEffort.HIGH)

    # Only a real system turn is consumed as the identity. Previously the
    # first message was always taken and dropped, which lost a leading user
    # turn from the conversation.
    if messages and messages[0].get("role") == "system":
        identity, turns = messages[0]["content"], messages[1:]
    else:
        identity, turns = SYSTEM_DEF, messages

    system_content = (
        SystemContent.new()
        .with_model_identity(identity)
        .with_reasoning_effort(effort)
        .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
        .with_knowledge_cutoff("2025-01")
        .with_required_channels(REQUIRED_CHANNELS)
    )

    harmony_messages = [
        Message.from_role_and_content(Role.SYSTEM, system_content)
    ]
    for msg in turns:
        if msg["role"] == "user":
            harmony_messages.append(
                Message.from_role_and_content(Role.USER, msg["content"])
            )
        elif msg["role"] == "assistant":
            # Prior assistant turns carry no channel info here; file them as final.
            harmony_messages.append(
                Message.from_role_and_content(Role.ASSISTANT, msg["content"])
                .with_channel("final")
            )

    convo = Conversation.from_messages(harmony_messages)
    tokens = harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
    # The model is driven with text, so the rendered tokens are decoded back.
    return tokenizer.decode(tokens)
def _harmony_message_text(content: Any) -> str:
    """Flatten a Harmony message's content (a string or content parts) to text."""
    if isinstance(content, str):
        return content
    parts = content if isinstance(content, (list, tuple)) else [content]
    return "".join(
        part if isinstance(part, str) else str(getattr(part, "text", "") or "")
        for part in parts
    )


def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
    """Split generated tokens into per-channel text.

    Args:
        tokens: Token ids of the completion only (no prompt tokens).

    Returns:
        A ``{channel: text}`` dict that always contains ``"final"``. Without
        ``openai_harmony`` the dict holds ``"final"`` (extracted by marker)
        and ``"raw"`` (the full decoded text).
    """
    if not HARMONY_AVAILABLE:
        # Fallback: just decode and extract the final channel manually
        text = tokenizer.decode(tokens, skip_special_tokens=False)
        return {"final": extract_final_channel_fallback(text), "raw": text}

    parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)

    channels: Dict[str, str] = {}
    for msg in parsed_messages:
        # A missing or None channel is treated as the final answer so the
        # dict never gets a None key.
        channel = getattr(msg, "channel", None) or "final"
        # Content arrives as content parts rather than a plain string;
        # appending it directly to a str raised TypeError.
        channels[channel] = channels.get(channel, "") + _harmony_message_text(msg.content)

    # Ensure a final channel exists, built from whatever was produced.
    if "final" not in channels:
        channels["final"] = " ".join(channels.values())
    return channels
def extract_final_channel_fallback(text: str) -> str:
    """Return the text of the last ``final`` channel found in raw model output.

    Everything after the last final-channel marker is kept, cut at the first
    end marker, and stripped. Without a final-channel marker the whole input
    is returned stripped.
    """
    _, marker_found, tail = text.rpartition("<|channel|>final<|message|>")
    if not marker_found:
        return text.strip()

    # Cut at each end marker in turn; a marker that is absent leaves tail as is.
    for stop in ("<|return|>", "<|end|>", "<|endoftext|>"):
        tail = tail.partition(stop)[0]
    return tail.strip()
 # -----------------------
-# Rose guidance
 # -----------------------
def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
    """Build a float32 bias vector of length ``len(tokenizer)`` from token weights.

    Each token is resolved with ``tokenizer.convert_tokens_to_ids``. A single
    id receives the full weight; a list of ids shares the weight equally.
    ``None`` tokens and ids that are not non-negative ints are skipped.
    """
    bias = torch.zeros(len(tokenizer), dtype=torch.float32)
    for token, weight in mapping.items():
        if token is None:
            continue
        resolved = tokenizer.convert_tokens_to_ids(token)
        # Treat a lone id as a one-element list so both shapes share a loop.
        ids = resolved if isinstance(resolved, list) else [resolved]
        divisor = max(1, len(ids))
        for token_id in ids:
            if isinstance(token_id, int) and token_id >= 0:
                bias[token_id] += float(weight) / divisor
    return bias
class RoseGuidedLogits(torch.nn.Module):
    """Logits processor that adds a scaled, fixed per-token bias to the scores.

    Args:
        bias_vec: 1-D bias over token ids.
        alpha: Multiplier applied to the bias before it is added.
    """

    def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
        super().__init__()
        self.bias_vec = bias_vec
        self.alpha = float(alpha)

    def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores + alpha * bias``; ``input_ids`` is not used.

        The bias is zero-padded or truncated to the last dimension of
        ``scores``. A bias sized from the tokenizer can differ from the
        model's logits width, and a length mismatch would otherwise fail to
        broadcast.
        """
        bias = self.bias_vec.to(scores.device)
        width = scores.shape[-1]
        have = bias.shape[-1]
        if have < width:
            bias = torch.nn.functional.pad(bias, (0, width - have))
        elif have > width:
            bias = bias[..., :width]
        return scores + self.alpha * bias
@spaces.GPU(duration=120)
def zerogpu_generate(full_prompt: str,
                     gen_kwargs: Dict[str, Any],
                     rose_map: Optional[Dict[str, float]],
                     rose_alpha: float,
                     rose_score: Optional[float],
                     seed: Optional[int]) -> Dict[str, str]:
    """Load the model, generate a completion, and return it split by channel.

    Args:
        full_prompt: Fully rendered prompt text.
        gen_kwargs: Optional keys ``do_sample``, ``temperature``, ``top_p``,
            ``top_k`` and ``max_new_tokens``.
        rose_map: Optional ``{token: weight}`` bias map; falsy disables it.
        rose_alpha: Bias strength.
        rose_score: Optional extra multiplier on ``rose_alpha``.
        seed: Optional torch RNG seed.

    Returns:
        The channel dict from ``parse_harmony_response``. On any exception,
        a dict whose ``"final"`` entry is an ``[Error] ...`` message and
        whose ``"raw"`` entry is the exception text.
    """
    model = None  # bound up front so cleanup never meets an unbound name
    try:
        if seed is not None:
            torch.manual_seed(int(seed))

        model = _load_model_on("auto")
        device = next(model.parameters()).device

        # Optional Rose guidance as a logits processor.
        logits_processor = None
        if rose_map:
            bias = build_bias_from_tokens(tokenizer, rose_map).to(device)
            eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
            logits_processor = [RoseGuidedLogits(bias, eff_alpha)]

        inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

        # Sampling needs a strictly positive temperature; treat 0 as greedy
        # instead of letting generate() reject the request.
        temperature = float(gen_kwargs.get("temperature", 0.7))
        do_sample = bool(gen_kwargs.get("do_sample", True)) and temperature > 0.0
        generate_args: Dict[str, Any] = dict(
            do_sample=do_sample,
            max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            logits_processor=logits_processor,
        )
        if do_sample:
            # Sampling knobs are only meaningful (and only passed) when sampling.
            top_k = int(gen_kwargs.get("top_k") or 0)
            generate_args.update(
                temperature=temperature,
                top_p=float(gen_kwargs.get("top_p", 0.9)),
                top_k=top_k if top_k > 0 else None,
            )

        # Generate
        out_ids = model.generate(**inputs, **generate_args)

        # Keep only the newly generated tokens.
        prompt_len = int(inputs["input_ids"].shape[1])
        gen_ids = out_ids[0][prompt_len:].tolist()

        # parse_harmony_response already falls back to marker-based
        # extraction when openai_harmony is unavailable.
        return parse_harmony_response(gen_ids)
    except Exception as e:
        return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": str(e)}
    finally:
        # Cleanup
        model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
 # -----------------------
-# Gradio handlers
 # -----------------------
def _parse_rose_map(rose_tokens: str, rose_json: str) -> Optional[Dict[str, float]]:
    """Merge ``token:weight`` pairs and a JSON object into one weight map, or None if empty."""
    weights: Dict[str, float] = {}
    for pair in (rose_tokens or "").split(","):
        key, sep, raw = pair.partition(":")
        if not sep:
            continue
        try:
            weights[key.strip()] = float(raw)
        except ValueError:
            continue  # malformed weight: skip this pair, keep the rest
    if rose_json:
        try:
            loaded = json.loads(rose_json)
        except (TypeError, ValueError):
            loaded = None  # invalid JSON is ignored; JSONDecodeError is a ValueError
        if isinstance(loaded, dict):
            for key, raw in loaded.items():
                try:
                    weights[str(key)] = float(raw)
                except (TypeError, ValueError):
                    continue
    return weights or None


def generate_response(message: str, history: List[List[str]], system_prompt: str,
                      temperature: float, top_p: float, top_k: int, max_new_tokens: int,
                      do_sample: bool, seed: Optional[int],
                      rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
                      rose_tokens: str, rose_json: str,
                      show_thinking: bool = False,
                      reasoning_effort: str = "high") -> str:
    """Gradio chat handler: build the prompt, generate, and format the reply.

    Args:
        message: The new user message.
        history: Prior turns as ``[user, assistant]`` pairs; other shapes are skipped.
        system_prompt: System prompt; empty falls back to ``SYSTEM_DEF``.
        temperature, top_p, top_k, max_new_tokens, do_sample, seed:
            Generation settings forwarded to ``zerogpu_generate``.
        rose_enable, rose_alpha, rose_score, rose_tokens, rose_json:
            Rose bias settings; weights are parsed only when enabled.
        show_thinking: When true, non-final channels are shown as well.
        reasoning_effort: Passed to ``create_harmony_prompt``.

    Returns:
        The final answer, or a markdown report of all channels. Any exception
        is returned as an ``[Error] ...`` string.
    """
    try:
        messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
        for turn in history or []:
            if isinstance(turn, (list, tuple)) and len(turn) >= 2:
                user_msg, assistant_msg = turn[0], turn[1]
                if user_msg:
                    messages.append({"role": "user", "content": str(user_msg)})
                if assistant_msg:
                    messages.append({"role": "assistant", "content": str(assistant_msg)})
        messages.append({"role": "user", "content": str(message)})

        # create_harmony_prompt falls back to the tokenizer's chat template
        # itself when openai_harmony is unavailable.
        prompt = create_harmony_prompt(messages, reasoning_effort)

        rose_map = _parse_rose_map(rose_tokens, rose_json) if rose_enable else None

        # A cleared slider can deliver None; treat that as "disabled".
        top_k_value = int(top_k) if top_k else 0

        channels = zerogpu_generate(
            prompt,
            {
                "do_sample": bool(do_sample),
                "temperature": float(temperature),
                "top_p": float(top_p),
                "top_k": top_k_value if top_k_value > 0 else None,
                "max_new_tokens": int(max_new_tokens),
            },
            rose_map,
            float(rose_alpha),
            float(rose_score) if rose_score is not None else None,
            int(seed) if seed is not None else None,
        )

        if not show_thinking:
            return channels.get("final", "No final response generated")

        # Show all channels, final last.
        sections = ["## Chain of Thought:\n\n"]
        for channel, content in channels.items():
            if channel != "final" and content:
                sections.append(f"### {str(channel).capitalize()} Channel:\n{content}\n\n")
        sections.append(f"### Final Response:\n{channels.get('final', 'No final response generated')}")
        return "".join(sections)
    except Exception as e:
        return f"[Error] {type(e).__name__}: {str(e)}"
 # -----------------------
-# UI
 # -----------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
        # Mirel – Harmony Chain-of-Thought Inference
        OSS-20B model using Harmony format with thinking channels.
        The model thinks through problems in internal channels before providing a final response.
        **Note:** Install `openai-harmony` for full Harmony support: `pip install openai-harmony`
        """
    )

    with gr.Row():
        system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_DEF, lines=2)

    # Sampling and reasoning controls.
    with gr.Accordion("Generation Settings", open=False):
        with gr.Row():
            temperature = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.9, step=0.01, label="Top-p"
            )
            top_k = gr.Slider(
                minimum=0, maximum=200, value=0, step=1, label="Top-k (0=disabled)"
            )
        with gr.Row():
            max_new = gr.Slider(
                minimum=16, maximum=4096, value=MAX_DEF, step=16, label="Max new tokens"
            )
            do_sample = gr.Checkbox(value=True, label="Do sample")
            seed = gr.Number(value=None, label="Seed (optional)", precision=0)
        with gr.Row():
            reasoning_effort = gr.Radio(
                choices=["low", "medium", "high"],
                value="high",
                label="Reasoning Effort",
                info="How much thinking the model should do",
            )
            show_thinking = gr.Checkbox(
                value=False,
                label="Show thinking channels",
                info="Display all internal reasoning channels",
            )

    # Optional token-bias controls.
    with gr.Accordion("Rose Guidance (Optional)", open=False):
        gr.Markdown("Fine-tune generation with token biases")
        with gr.Row():
            rose_enable = gr.Checkbox(value=False, label="Enable Rose bias")
            rose_alpha = gr.Slider(
                minimum=0.0, maximum=5.0, value=1.0, step=0.05, label="Alpha (strength)"
            )
            rose_score = gr.Slider(
                minimum=0.0, maximum=1.0, value=1.0, step=0.01, label="Score multiplier"
            )
        rose_tokens = gr.Textbox(
            label="Token:weight pairs", placeholder="example:1.5, test:-0.5", value=""
        )
        rose_json = gr.Textbox(
            label="JSON weights", placeholder='{"token": 1.0, "another": -0.5}', value=""
        )

    # Chat interface; the extra inputs follow generate_response's parameter
    # order after (message, history).
    chat = gr.ChatInterface(
        fn=generate_response,
        title="Chat with Mirel",
        description="A chain-of-thought model using Harmony format",
        additional_inputs=[
            system_prompt,
            temperature,
            top_p,
            top_k,
            max_new,
            do_sample,
            seed,
            rose_enable,
            rose_alpha,
            rose_score,
            rose_tokens,
            rose_json,
            show_thinking,
            reasoning_effort,
        ],
        examples=[
            ["Hello! Can you introduce yourself?"],
            ["What is the capital of France?"],
            ["Explain quantum computing in simple terms"],
            ["Solve: If a train travels 120 miles in 2 hours, what is its average speed?"],
        ],
        cache_examples=False,
    )

    # Footer with configuration notes.
    gr.Markdown(
        """
        ---
        ### Configuration:
        - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
        - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER`
        - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
        - **Harmony**: Install with `pip install openai-harmony` for proper channel support
        The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
        """
    )
 if __name__ == "__main__":
-    demo.queue(max_size=8 if ZEROGPU else 32).launch(
-        server_name="0.0.0.0",
         server_port=7860,
-        share=False
     )

 """
 Mirel Harmony Inference – HF Space (Gradio)
+Simplified version with robust error handling
 """
+import os
+import gc
+import json
+import torch
 import gradio as gr
+from typing import List, Dict, Optional, Any, Generator
 from transformers import AutoTokenizer, AutoModelForCausalLM
+# Check if spaces is available
 try:
+    import spaces
+    SPACES_AVAILABLE = True
 except ImportError:
+    SPACES_AVAILABLE = False
+    print("[WARNING] spaces not available, running without ZeroGPU")
 # -----------------------
+# Config
 # -----------------------
# Settings read once from the environment at import time.
MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
ADAPTER_ID = os.getenv("ADAPTER_ID")
ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER")
SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "You are Mirel, a helpful assistant.")
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
# Normalized so values such as "BF16" or " fp16 " are recognized; dtype
# selection compares against lower-case names, and an unrecognized value
# silently means float32.
DTYPE = os.getenv("DTYPE", "bf16").strip().lower()
ZEROGPU = os.getenv("ZEROGPU", "0") == "1"

# HF Token: the first non-empty value among the supported variable names.
HF_TOKEN = (
    os.getenv("HF_TOKEN")
    or os.getenv("HUGGING_FACE_HUB_TOKEN")
    or os.getenv("HUGGINGFACEHUB_API_TOKEN")
)

# Best-effort Hub login: a failure is reported and startup continues.
if HF_TOKEN:
    try:
        from huggingface_hub import login
        login(token=HF_TOKEN)
        print("[Auth] Logged in to Hugging Face")
    except Exception as e:
        print(f"[Auth] Failed to login: {e}")
 # -----------------------
+# Model Loading
 # -----------------------
# The tokenizer is loaded eagerly at import time.
print(f"[Model] Loading tokenizer from {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)

# Cache slot for the model; it stays None until load_model() fills it.
model = None
def get_dtype(name: Optional[str] = None) -> torch.dtype:
    """Map a short dtype name to the torch dtype used for loading the model.

    Args:
        name: ``"bf16"``, ``"fp16"`` or anything else, matched ignoring case
            and surrounding whitespace. ``None`` (the default) uses the
            module-level ``DTYPE`` setting.

    Returns:
        ``torch.bfloat16`` for bf16 when CUDA is available, ``torch.float16``
        for fp16, and ``torch.float32`` otherwise (including bf16 without CUDA).
    """
    requested = (DTYPE if name is None else name).strip().lower()
    if requested == "bf16" and torch.cuda.is_available():
        return torch.bfloat16
    if requested == "fp16":
        return torch.float16
    return torch.float32
def load_model():
    """Return the cached model, loading it on first use.

    The base model is loaded with ``device_map="auto"`` when CUDA is
    available and ``"cpu"`` otherwise. If ``ADAPTER_ID`` is set, a PEFT
    adapter is applied on top; a missing ``peft`` package or any adapter
    failure is reported as a warning and the current model is kept.
    """
    global model
    if model is not None:
        return model

    print(f"[Model] Loading model from {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=get_dtype(),
        device_map="auto" if torch.cuda.is_available() else "cpu",
        trust_remote_code=True,
        token=HF_TOKEN,
        low_cpu_mem_usage=True,
    )

    # Optional adapter.
    if ADAPTER_ID:
        try:
            from peft import PeftModel
            print(f"[Model] Loading adapter from {ADAPTER_ID}")
            extra = {"token": HF_TOKEN}
            if ADAPTER_SUBFOLDER:
                extra["subfolder"] = ADAPTER_SUBFOLDER
            model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, **extra)
        except ImportError:
            print("[WARNING] PEFT not installed, skipping adapter")
        except Exception as e:
            print(f"[WARNING] Failed to load adapter: {e}")

    model.eval()
    return model
def extract_final_response(text: str) -> str:
    """Pull the last ``final`` channel out of raw chain-of-thought output.

    The text after the last final-channel marker is cut at the first end
    marker and stripped. If no final-channel marker is present, the whole
    input is returned stripped.
    """
    final_marker = "<|channel|>final<|message|>"
    start = text.rfind(final_marker)
    if start == -1:
        # No final channel: hand back the stripped input.
        return text.strip()

    answer = text[start + len(final_marker):]
    for end_marker in ("<|return|>", "<|end|>", "<|endoftext|>"):
        cut = answer.find(end_marker)
        if cut != -1:
            answer = answer[:cut]
    return answer.strip()
 # -----------------------
+# Generation Function
 # -----------------------
def generate_text(
    prompt: str,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 0,
    max_new_tokens: int = 512,
    do_sample: bool = True,
) -> str:
    """Generate a completion for ``prompt`` and return the raw decoded text.

    Args:
        prompt: Fully rendered prompt text.
        temperature: Sampling temperature; a value <= 0 switches to greedy decoding.
        top_p: Nucleus sampling threshold.
        top_k: Top-k cutoff; 0 or less disables it.
        max_new_tokens: Maximum number of tokens to generate.
        do_sample: Sample when true, decode greedily otherwise.

    Returns:
        The newly generated text with special tokens kept, or a
        ``Generation error: ...`` message if anything fails.
    """
    try:
        # Load/get model
        model_instance = load_model()

        # Put the inputs where the model's weights are instead of assuming "cuda".
        device = next(model_instance.parameters()).device
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Sampling needs a strictly positive temperature.
        sampling = bool(do_sample) and float(temperature) > 0.0
        gen_kwargs = dict(
            max_new_tokens=int(max_new_tokens),
            do_sample=sampling,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        if sampling:
            # Sampling knobs are only passed when they apply.
            k = int(top_k) if top_k else 0
            gen_kwargs.update(
                temperature=float(temperature),
                top_p=float(top_p),
                top_k=k if k > 0 else None,
            )

        # Generate
        with torch.no_grad():
            outputs = model_instance.generate(**inputs, **gen_kwargs)

        # Decode only the tokens produced after the prompt.
        prompt_len = inputs["input_ids"].shape[1]
        generated_ids = outputs[0][prompt_len:]
        return tokenizer.decode(generated_ids, skip_special_tokens=False)
    except Exception as e:
        error_msg = f"Generation error: {str(e)}"
        print(f"[ERROR] {error_msg}")
        return error_msg
    finally:
        # Cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
# Wrap generation for ZeroGPU only when the flag is set and `spaces` imported.
if ZEROGPU and SPACES_AVAILABLE:
    _gpu_decorator = spaces.GPU(duration=120)
    generate_text = _gpu_decorator(generate_text)
 # -----------------------
+# Chat Function
 # -----------------------
def chat_fn(
    message: str,
    history: List[List[str]],
    system_prompt: str,
    temperature: float,
    top_p: float,
    top_k: int,
    max_new_tokens: int,
    do_sample: bool,
    show_thinking: bool,
) -> str:
    """Build the prompt for one chat turn, generate, and format the reply.

    Args:
        message: The new user message.
        history: Prior turns as ``[user, assistant]`` pairs; may be empty or None.
        system_prompt: System prompt; empty falls back to ``SYSTEM_PROMPT``.
        temperature, top_p, top_k, max_new_tokens, do_sample:
            Generation settings forwarded to ``generate_text``.
        show_thinking: When true, the raw output is shown above the final answer.

    Returns:
        The final answer, optionally preceded by the full raw output. Any
        exception is returned as a ``Chat error: ...`` string.
    """
    try:
        # Build conversation
        messages = [{"role": "system", "content": system_prompt or SYSTEM_PROMPT}]
        for user_msg, assistant_msg in (history or []):
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": message})

        # Apply chat template
        try:
            prompt = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )
        except Exception:
            # Fallback to a simple transcript. The header uses the resolved
            # system message so an empty prompt still gets the default
            # (the raw argument could render as "None" or an empty line).
            turns = [f"{m['role'].upper()}: {m['content']}\n" for m in messages[1:]]
            prompt = f"{messages[0]['content']}\n\n" + "".join(turns) + "ASSISTANT: "

        # Generate response
        full_response = generate_text(
            prompt=prompt,
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
            max_new_tokens=int(max_new_tokens),
            do_sample=do_sample,
        )

        # Process response
        final = extract_final_response(full_response)
        if show_thinking:
            # Show the full output with channels, then the final answer.
            return f"**Full Output:**\n```\n{full_response}\n```\n\n**Final Response:**\n{final}"
        return final
    except Exception as e:
        error_msg = f"Chat error: {str(e)}"
        print(f"[ERROR] {error_msg}")
        return error_msg
 # -----------------------
+# Gradio Interface
 # -----------------------
def create_interface():
    """Build and return the Gradio Blocks app (chat pane plus settings column)."""
    with gr.Blocks(title="Mirel Chat") as demo:
        gr.Markdown(
            """
            # Mirel - Chain-of-Thought Chat
            Chat with a model that thinks before responding.
            """
        )
        with gr.Row():
            with gr.Column(scale=4):
                chatbot = gr.Chatbot(height=500)
                msg = gr.Textbox(
                    label="Message",
                    placeholder="Type your message here...",
                    lines=2
                )
                with gr.Row():
                    submit = gr.Button("Send", variant="primary")
                    clear = gr.Button("Clear")
            with gr.Column(scale=1):
                system_prompt = gr.Textbox(
                    label="System Prompt",
                    value=SYSTEM_PROMPT,
                    lines=3
                )
                with gr.Accordion("Settings", open=False):
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.1,
                        label="Top-p"
                    )
                    top_k = gr.Slider(
                        minimum=0,
                        maximum=100,
                        value=0,
                        step=1,
                        label="Top-k (0=disabled)"
                    )
                    max_new_tokens = gr.Slider(
                        minimum=64,
                        maximum=2048,
                        value=MAX_NEW_TOKENS,
                        step=64,
                        label="Max Tokens"
                    )
                    do_sample = gr.Checkbox(
                        value=True,
                        label="Do Sample"
                    )
                    show_thinking = gr.Checkbox(
                        value=False,
                        label="Show Thinking Process"
                    )

        # Event handlers
        def user_submit(message, history):
            """Clear the textbox and append the message as a new unanswered turn."""
            history = history or []
            # A blank submit must not create an empty turn that never gets a reply.
            if not (message or "").strip():
                return "", history
            return "", history + [[message, None]]

        def bot_respond(history, system, temp, top_p, top_k, max_tokens, sample, thinking):
            """Fill in the assistant reply for the last, still unanswered turn."""
            if not history or not history[-1][0]:
                return history
            # Already answered (e.g. after a blank submit): do not regenerate.
            if history[-1][1] is not None:
                return history
            user_message = history[-1][0]
            bot_message = chat_fn(
                user_message,
                history[:-1],  # Don't include current turn
                system,
                temp,
                top_p,
                top_k,
                max_tokens,
                sample,
                thinking
            )
            history[-1][1] = bot_message
            return history

        # Enter in the textbox and the Send button share one pipeline: record
        # the user turn immediately (unqueued), then generate the reply.
        reply_inputs = [
            chatbot, system_prompt, temperature, top_p, top_k,
            max_new_tokens, do_sample, show_thinking,
        ]
        for trigger in (msg.submit, submit.click):
            trigger(
                user_submit,
                [msg, chatbot],
                [msg, chatbot],
                queue=False
            ).then(
                bot_respond,
                reply_inputs,
                chatbot
            )
        clear.click(lambda: None, None, chatbot, queue=False)
    return demo
+# -----------------------
+# Main
+# -----------------------
 if __name__ == "__main__":
+    demo = create_interface()
+    demo.queue(max_size=10)
+    demo.launch(
+        server_name="0.0.0.0",
         server_port=7860,
+        share=True
     )