Spaces: AbstractPhil / Mirel · Running on Zero

Commit 73c138b · Parent(s): 4e7580d
Committed by AbstractPhil: "super condensed test"

app.py CHANGED
@@ -1,282 +1,75 @@
  """
- Mirel
- ZeroGPU-
- Chain-of-thought model with proper channel extraction using openai_harmony
  Single file: app.py
  """
  from __future__ import annotations
- import os, gc, json,
- from
- from typing import List, Dict, Optional, Any
- from datetime import datetime
  import gradio as gr
- import spaces
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # Import Harmony components
- try:
-     from openai_harmony import (
-         Author,
-         Conversation,
-         HarmonyEncodingName,
-         Message,
-         Role,
-         SystemContent,
-         DeveloperContent,
-         load_harmony_encoding,
-         ReasoningEffort
-     )
-     HARMONY_AVAILABLE = True
- except ImportError:
-     print("[WARNING] openai_harmony not installed. Install with: pip install openai-harmony")
-     HARMONY_AVAILABLE = False

  # -----------------------
- #
  # -----------------------
- …
- ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
- ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
- DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
- SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
- MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "256"))
- ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
- LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"
-
- # Harmony channels for CoT
- REQUIRED_CHANNELS = ["analysis", "final"]
-
- # HF Auth - properly handle multiple token env var names
  HF_TOKEN: Optional[str] = (
-     os.getenv("HF_TOKEN")
-     or os.getenv("HUGGING_FACE_HUB_TOKEN")
      or os.getenv("HUGGINGFACEHUB_API_TOKEN")
      or os.getenv("HF_ACCESS_TOKEN")
  )

- def _hf_login() -> None:
-     """Login to HF Hub using common env secret names."""
-     if HF_TOKEN:
-         try:
-             from huggingface_hub import login, whoami
-             login(token=HF_TOKEN, add_to_git_credential=True)
-             try:
-                 who = whoami(token=HF_TOKEN)
-                 print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
-             except Exception:
-                 print("[HF Auth] Login successful but couldn't get user info")
-         except Exception as e:
-             print(f"[HF Auth] Login failed: {e}")
-     else:
-         print("[HF Auth] No token found in environment variables")
-
- # Login is handled by Space OAuth/session; avoid explicit CLI login here to prevent OAuth var errors
- # _hf_login()
-
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
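The deleted config block reads DTYPE through a DTYPE_MAP lookup whose definition is not rendered in this view. A minimal sketch of what that mapping presumably looked like (an assumption for illustration; not recovered from the commit):

    # Presumed shape of the unrendered DTYPE_MAP (assumption, illustration only)
    import torch
    DTYPE_MAP = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
        "fp32": torch.float32,
    }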

- #
- …
- harmony_encoding = None
-
- # Stop tokens per Harmony spec: <|return|> (200002), <|call|> (200012)
- HARMONY_STOP_IDS = harmony_encoding.stop_tokens_for_assistant_actions() if HARMONY_AVAILABLE else []
-
- # Tokenizer is lightweight; load once
- try:
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
-     print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
- except Exception as e:
-     print(f"[Model] Failed to load tokenizer: {e}")
-     raise

  # -----------------------
- #
  # -----------------------
- …
-     trust_remote_code=True,
-     low_cpu_mem_usage=True,
-     token=HF_TOKEN,
- )
- if LOAD_4BIT and device_map != "cpu":
      try:
- …
      except Exception:
          pass
-     return
-
- def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
-     print(f"[Model] Loading base model from {MODEL_ID}...")
-     model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))
-
-     if ADAPTER_ID:
-         if not _HAS_PEFT:
-             raise RuntimeError("peft is required when ADAPTER_ID is set.")
-         print(f"[Model] Loading adapter from {ADAPTER_ID}...")
-         peft_kwargs: Dict[str, Any] = {"token": HF_TOKEN}
-         if ADAPTER_SUBFOLDER:
-             peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
-         model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, **peft_kwargs)
-
-     model.eval()
-     # Ensure a valid pad_token_id is set; some OSS checkpoints reuse eos as pad
-     if getattr(model.config, "pad_token_id", None) is None:
-         model.config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
-     model.config.use_cache = True
-     print("[Model] Model loaded successfully")
-     return model
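The LOAD_4BIT branch body above is not rendered; it presumably attached a quantization config before loading. A hedged sketch assuming bitsandbytes through transformers' BitsAndBytesConfig (helper name hypothetical):

    # Hypothetical reconstruction of the unrendered 4-bit branch (assumption)
    import torch
    from transformers import BitsAndBytesConfig

    def _maybe_quantize(kwargs: dict) -> None:
        # NF4 4-bit weights with bf16 compute; needs bitsandbytes on a CUDA device
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )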
-
- # -----------------------
- # Harmony formatting
- # -----------------------
-
- def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> Any:
-     """Build a Harmony-formatted prompt. If Harmony is available, return **token IDs**
-     rendered by `openai_harmony` (authoritative). Otherwise fall back to the
-     tokenizer's chat template and return a string.
-     """
-     if HARMONY_AVAILABLE and harmony_encoding is not None:
-         effort_map = {"low": ReasoningEffort.LOW, "medium": ReasoningEffort.MEDIUM, "high": ReasoningEffort.HIGH}
-         effort = effort_map.get(str(reasoning_effort).lower(), ReasoningEffort.HIGH)
-
-         system_content = (
-             SystemContent.new()
-             .with_model_identity("You are ChatGPT, a large language model trained by OpenAI.")
-             .with_reasoning_effort(effort)
-             .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
-             .with_knowledge_cutoff("2024-06")
-             .with_required_channels(REQUIRED_CHANNELS)
-         )
-
-         # Use first system message as developer instructions if present, else SYSTEM_DEF
-         sys_text = SYSTEM_DEF
-         rest: List[Dict[str, str]] = messages or []
-         if rest and rest[0].get("role") == "system":
-             sys_text = rest[0].get("content") or SYSTEM_DEF
-             rest = rest[1:]
-
-         harmony_messages = [Message.from_role_and_content(Role.SYSTEM, system_content)]
-         dev = DeveloperContent.new().with_instructions(sys_text)
-         harmony_messages.append(Message.from_role_and_content(Role.DEVELOPER, dev))
-
-         for m in rest:
-             role = m.get("role"); content = m.get("content", "")
-             if role == "user":
-                 harmony_messages.append(Message.from_role_and_content(Role.USER, content))
-             elif role == "assistant":
-                 harmony_messages.append(
-                     Message.from_role_and_content(Role.ASSISTANT, content).with_channel("final")
-                 )
-
-         convo = Conversation.from_messages(harmony_messages)
-         rendered = harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
-         # Ensure assistant header includes a final channel + message start to avoid 'assistantassistant...' loops
-         try:
-             _tail = tokenizer.decode(list(rendered)[-64:], skip_special_tokens=False)
-             if '<|channel|>final<|message|>' not in _tail:
-                 rendered = list(rendered) + tokenizer.encode('<|channel|>final<|message|>', add_special_tokens=False)
-         except Exception:
-             rendered = list(rendered)
-         return rendered
-
-     # Fallback: tokenizer chat template -> string prompt
-     if not messages or messages[0].get("role") != "system":
-         messages = [{"role": "system", "content": SYSTEM_DEF}] + (messages or [])
-     return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
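For orientation, a minimal call into the deleted prompt builder (messages illustrative; the return value is a list of Harmony token IDs when openai_harmony is installed, otherwise a chat-template string):

    # Illustrative usage of the deleted helper
    msgs = [{"role": "system", "content": "You are Mirel."},
            {"role": "user", "content": "Hello"}]
    prompt = create_harmony_prompt(msgs, reasoning_effort="medium")
    # -> list[int] of rendered Harmony tokens, or a string fallback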
-
- def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
-     """Parse response tokens using Harmony format to extract channels."""
-     if not HARMONY_AVAILABLE:
-         # Fallback: just decode and extract final channel manually
-         text = tokenizer.decode(tokens, skip_special_tokens=False)
-         return {"final": extract_final_channel_fallback(text), "raw": text}
-
-     # Parse messages from completion tokens
-     parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)
-
-     # Extract content by channel
-     channels = {}
-     for msg in parsed_messages:
-         channel = msg.channel if hasattr(msg, 'channel') else "final"
-         if channel not in channels:
-             channels[channel] = ""
-         channels[channel] += "".join([getattr(part, "text", str(part)) for part in (msg.content if isinstance(msg.content, list) else [msg.content])])
-
-     # Ensure we have a final channel
-     if "final" not in channels:
-         channels["final"] = " ".join(channels.values())
-
-     return channels
-
- def extract_final_channel_fallback(text: str) -> str:
-     """Robustly extract the <final> channel from decoded Harmony text.
-     Works even if parsing fails or the model emits extra headers.
-     """
-     try:
-         chunks: Dict[str, str] = {}
-         pieces = text.split("<|channel|>")
-         for seg in pieces[1:]:
-             name_end = seg.find("<|message|>")
-             if name_end <= 0:
-                 continue
-             ch = seg[:name_end].strip()
-             body_start = name_end + len("<|message|>")
-             # end at next channel/end/return marker
-             next_pos = len(seg)
-             for delim in ("<|channel|>", "<|end|>", "<|return|>"):
-                 p = seg.find(delim, body_start)
-                 if p != -1:
-                     next_pos = min(next_pos, p)
-             body = seg[body_start:next_pos]
-             chunks[ch] = chunks.get(ch, "") + body
-         final_txt = (chunks.get("final", "").strip())
-         if final_txt:
-             return final_txt
-         # Fallback: everything after last final marker up to a terminator
-         if "<|channel|>final<|message|>" in text:
-             tail = text.split("<|channel|>final<|message|>")[-1]
-             for delim in ("<|return|>", "<|end|>", "<|channel|>"):
-                 idx = tail.find(delim)
-                 if idx != -1:
-                     tail = tail[:idx]
-                     break
-             return tail.strip()
-     except Exception:
-         pass
-     return text.strip()

- …

- def
- …
-     for tok, w in mapping.items():
-         if tok is None:
-             continue
-         tid = tokenizer.convert_tokens_to_ids(tok)
          if isinstance(tid, list):
              for t in tid:
                  if isinstance(t, int) and t >= 0:
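The deleted text-level fallback can be sanity-checked on a hand-written Harmony-style string (input illustrative):

    # Illustrative check of extract_final_channel_fallback
    sample = ("<|channel|>analysis<|message|>thinking...<|end|>"
              "<|channel|>final<|message|>4<|return|>")
    assert extract_final_channel_fallback(sample) == "4"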
@@ -285,184 +78,71 @@ def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor
              bias[tid] += float(w)
      return bias

- class RoseGuidedLogits(torch.nn.Module):
-     def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
-         super().__init__()
-         self.bias_vec = bias_vec
-         self.alpha = float(alpha)
-
-     def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-         return scores + self.alpha * self.bias_vec.to(scores.device)
-
- class StopOnTokens(StoppingCriteria):
-     def __init__(self, stop_ids: List[int]):
-         self.stop_ids = set(int(s) for s in (stop_ids or []))
-     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
-         return int(input_ids[0, -1]) in self.stop_ids
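RoseGuidedLogits is a plain additive bias on the next-token scores, scores' = scores + alpha * bias; a toy check with made-up numbers:

    # Toy check of the additive logit bias (illustrative values)
    import torch
    scores = torch.tensor([[0.0, 1.0, 2.0]])
    bias = torch.tensor([0.5, 0.0, -1.0])
    proc = RoseGuidedLogits(bias, alpha=2.0)
    assert proc(None, scores).tolist() == [[1.0, 1.0, 0.0]]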
-
- @spaces.GPU(duration=120)
- def zerogpu_generate(full_prompt,
-                      gen_kwargs: Dict[str, Any],
-                      rose_map: Optional[Dict[str, float]],
-                      rose_alpha: float,
-                      rose_score: Optional[float],
-                      seed: Optional[int]) -> Dict[str, str]:
-     """Run inference on GPU and return parsed channels."""
-     try:
-         if seed is not None:
-             torch.manual_seed(int(seed))
-
-         # Load model
-         model = _load_model_on("auto")
-
-         # Setup logits processor for Rose guidance
-         logits_processor = None
-         if rose_map:
-             bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
-             eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
-             logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
-
-         # Tokenize / prepare inputs
-         device = next(model.parameters()).device
-         if HARMONY_AVAILABLE and not isinstance(full_prompt, str):
-             # Accept list/tuple or any iterable of ints from openai_harmony
-             try:
-                 token_list = list(full_prompt)
-             except TypeError:
-                 token_list = list(getattr(full_prompt, "ids", getattr(full_prompt, "token_ids", [])))
-             if not token_list:
-                 raise ValueError("Harmony prompt produced no tokens")
-             input_ids = torch.tensor([token_list], dtype=torch.long, device=device)
-             attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
-             inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
-             prompt_len = input_ids.shape[1]
-         else:
-             enc = tokenizer(full_prompt, return_tensors="pt")
-             inputs = {k: v.to(device) for k, v in enc.items()}
-             prompt_len = int(inputs["input_ids"].shape[1])
-             if "attention_mask" not in inputs:
-                 inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
-
-         # Prepare stopping
-         sc = None
-         if HARMONY_AVAILABLE and HARMONY_STOP_IDS:
-             sc = StoppingCriteriaList([StopOnTokens(HARMONY_STOP_IDS)])
-
-         # Generate
-         # Disallow degenerate header loops
-         bad_words_ids = None
-         try:
-             _B = []
-             for s in ("assistantassistant", "assistant", "<|assistant|>"):
-                 ids = tokenizer.encode(s, add_special_tokens=False)
-                 if ids:
-                     _B.append(ids)
-             bad_words_ids = _B if _B else None
-         except Exception:
-             pass
-
-         out_ids = model.generate(
-             **inputs,
-             do_sample=bool(gen_kwargs.get("do_sample", True)),
-             temperature=float(gen_kwargs.get("temperature", 0.6)),
-             top_p=(float(gen_kwargs.get("top_p")) if gen_kwargs.get("top_p") is not None else None),
-             top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
-             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-             pad_token_id=model.config.pad_token_id,
-             eos_token_id=tokenizer.eos_token_id,
-             repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.1)),
-             no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
-             logits_processor=logits_processor,
-             stopping_criteria=sc,
-         )
-
-         # Extract generated tokens only
-         out_list = out_ids[0].tolist()
-         gen_ids = out_list[prompt_len:]
-         # Truncate at first Harmony stop token if present
-         if HARMONY_AVAILABLE:
-             for sid in HARMONY_STOP_IDS:
-                 if sid in gen_ids:
-                     gen_ids = gen_ids[:gen_ids.index(sid)]
-                     break
-
-         # Parse response with Harmony
-         if HARMONY_AVAILABLE:
-             try:
-                 channels = parse_harmony_response(gen_ids)
-             except Exception:
-                 # Fallback to text parsing if Harmony parser fails
-                 decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
-                 channels = {
-                     "final": extract_final_channel_fallback(decoded),
-                     "raw": decoded
-                 }
-         else:
-             # Fallback decode + channels
-             decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
-             channels = {
-                 "final": extract_final_channel_fallback(decoded),
-                 "raw": decoded
-             }
-
-         return channels
-
-     except Exception as e:
-         return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": str(e)}
-     finally:
-         # Cleanup
-         try:
-             del model
-         except:
-             pass
-         gc.collect()
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
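The post-generation truncation in the deleted function mirrors StopOnTokens at the token level; a compact equivalent of that trim (sequence illustrative; 200002/200012 are the Harmony stop ids named earlier):

    # Post-hoc trim at the first Harmony stop token (illustrative sequence)
    gen_ids = [10, 11, 200002, 12]
    stop_ids = {200002, 200012}
    cut = next((i for i, t in enumerate(gen_ids) if t in stop_ids), len(gen_ids))
    assert gen_ids[:cut] == [10, 11]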
-

  # -----------------------
- #
  # -----------------------
  @spaces.GPU(duration=120)
- def
- …
      model = None
      try:
          if seed is not None:
              torch.manual_seed(int(seed))
-         model = _load_model_on("auto")
-         device = next(model.parameters()).device

- …
          enc = tokenizer(prompt_str, return_tensors="pt")
          inputs = {k: v.to(device) for k, v in enc.items()}
-         prompt_len = int(inputs["input_ids"].shape[1])
          if "attention_mask" not in inputs:
              inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)

-         #
          logits_processor = None
- …
-         logits_processor = [

- …
              **inputs,
-             do_sample=
-             temperature=float(
-             …
-             top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
-             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
              pad_token_id=model.config.pad_token_id,
              logits_processor=logits_processor,
          )
-
-         new_ids =
-         text = tokenizer.decode(new_ids, skip_special_tokens=True)
-         return {"final": text}
      except Exception as e:
-         return
      finally:
          try:
              del model
@@ -473,345 +153,58 @@ def zerogpu_generate_simple(prompt_str: str, gen_kwargs: Dict[str, Any], rose_ma
          torch.cuda.empty_cache()

  # -----------------------
- #
- # -----------------------
- …
-             prompt_len = input_ids.shape[1]
-         else:
-             enc = tokenizer(full_prompt, return_tensors="pt")
-             inputs = {k: v.to(device) for k, v in enc.items()}
-             if "attention_mask" not in inputs:
-                 inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
-             prompt_len = int(inputs["input_ids"].shape[1])
-
-         # Harmony stop via stopping criteria
-         sc = StoppingCriteriaList([StopOnTokens(HARMONY_STOP_IDS)]) if (HARMONY_AVAILABLE and HARMONY_STOP_IDS) else None
-
-         out_ids = model.generate(
-             **inputs,
-             do_sample=bool(gen_kwargs.get("do_sample", True)),
-             temperature=float(gen_kwargs.get("temperature", 0.7)),
-             top_p=float(gen_kwargs.get("top_p", 0.9)),
-             top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
-             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-             pad_token_id=model.config.pad_token_id,
-             eos_token_id=tokenizer.eos_token_id,
-             stopping_criteria=sc,
-             repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.15)),
-             no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
-         )
-
-         out_list = out_ids[0].tolist()
-         gen_ids = out_list[prompt_len:]
-         # Truncate at first Harmony stop token if present
-         if HARMONY_AVAILABLE and HARMONY_STOP_IDS:
-             for sid in HARMONY_STOP_IDS:
-                 if sid in gen_ids:
-                     gen_ids = gen_ids[:gen_ids.index(sid)]
-                     break
-
-         # Parse channels
-         if HARMONY_AVAILABLE:
-             try:
-                 channels = parse_harmony_response(gen_ids)
-             except Exception:
-                 decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
-                 channels = {"final": extract_final_channel_fallback(decoded), "raw": decoded}
-         else:
-             decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
-             channels = {"final": extract_final_channel_fallback(decoded), "raw": decoded}
-
-         # Small previews (avoid flooding logs/UI)
-         preview = {
-             "prompt_len": int(prompt_len),
-             "stop_ids": list(HARMONY_STOP_IDS) if HARMONY_AVAILABLE else [],
-             "gen_len": int(len(gen_ids)),
-             "gen_ids_head": gen_ids[:48],
-             "decoded_head": tokenizer.decode(gen_ids[:256], skip_special_tokens=False),
-             "channels": channels,
-         }
-         return preview
-     except Exception as e:
-         return {"error": f"{type(e).__name__}: {e}"}
-     finally:
-         try:
-             del model
-         except Exception:
-             pass
-         gc.collect()
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-
- # -----------------------
- # Gradio handlers
- # -----------------------
-
- def generate_response(message: str, history: List[List[str]], system_prompt: str,
-                       temperature: float, top_p: float, top_k: int, max_new_tokens: int,
-                       do_sample: bool, seed: Optional[int],
-                       rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
-                       rose_tokens: str, rose_json: str,
-                       show_thinking: bool = False,
-                       simple_mode: bool = True,  # NEW: default to simple chat_template path
-                       reasoning_effort: str = "high") -> str:
-     """
-     Generate response with proper CoT handling using Harmony format.
-     """
-     try:
-         # Build messages robustly for Gradio type='messages' or legacy tuple format
-         messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
-
-         # Add prior turns
-         if history:
-             if isinstance(history, list) and history and isinstance(history[0], dict):
-                 # history is already a flat list of {'role','content'} dicts
-                 for m in history:
-                     role = m.get("role")
-                     content = m.get("content", "")
-                     if role in ("user", "assistant"):
-                         messages.append({"role": role, "content": str(content)})
-             else:
-                 for turn in history:
-                     if isinstance(turn, (list, tuple)) and len(turn) >= 2:
-                         u, a = turn[0], turn[1]
-                         if u:
-                             messages.append({"role": "user", "content": str(u)})
-                         if a:
-                             messages.append({"role": "assistant", "content": str(a)})
-
-         # Current user message
-         if isinstance(message, dict):
-             user_text = message.get("content", "")
-         else:
-             user_text = str(message)
-         messages.append({"role": "user", "content": user_text})
-
-         # FAST PATH: simple chat_template prompt (recommended)
-         if simple_mode:
-             prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-         # Harmony path (optional)
-         elif HARMONY_AVAILABLE:
-             prompt = create_harmony_prompt(messages, reasoning_effort)  # returns token IDs
-         else:
-             # Fallback to tokenizer template (string)
-             prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-
-         # Build Rose map if enabled
-         rose_map: Optional[Dict[str, float]] = None
-         if rose_enable:
-             rose_map = {}
-             tok_str = (rose_tokens or "").strip()
-             if tok_str:
-                 for p in [p.strip() for p in tok_str.split(",") if p.strip()]:
-                     if ":" in p:
-                         k, v = p.split(":", 1)
-                         try:
-                             rose_map[k.strip()] = float(v)
-                         except:
-                             pass
-             if rose_json:
-                 try:
-                     j = json.loads(rose_json)
-                     if isinstance(j, dict):
-                         for k, v in j.items():
-                             try:
-                                 rose_map[str(k)] = float(v)
-                             except:
-                                 pass
-                 except:
-                     pass
-             if not rose_map:
-                 rose_map = None
-
-         # Generate with model
-         if simple_mode:
-             channels = zerogpu_generate_simple(
-                 prompt,
-                 {
-                     "do_sample": bool(do_sample),
-                     "temperature": float(temperature),
-                     "top_p": float(top_p) if top_p is not None else None,
-                     "top_k": int(top_k) if top_k > 0 else None,
-                     "max_new_tokens": int(max_new_tokens),
-                 },
-                 rose_map,
-                 float(rose_alpha),
-                 float(rose_score) if rose_score is not None else None,
-                 int(seed) if seed is not None else None,
-             )
-         else:
-             channels = zerogpu_generate(
-                 prompt,
-                 {
-                     "do_sample": bool(do_sample),
-                     "temperature": float(temperature),
-                     "top_p": float(top_p),
-                     "top_k": int(top_k) if top_k > 0 else None,
-                     "max_new_tokens": int(max_new_tokens),
-                 },
-                 rose_map,
-                 float(rose_alpha),
-                 float(rose_score) if rose_score is not None else None,
-                 int(seed) if seed is not None else None,
-             )
-
-         # Format response
-         if show_thinking:
-             # Show all channels
-             response = "## Chain of Thought:\n\n"
-             for channel, content in channels.items():
-                 if channel != "final" and content:
-                     response += f"### {channel.capitalize()} Channel:\n{content}\n\n"
-             response += f"### Final Response:\n{channels.get('final', 'No final response generated')}"
-             return response
-         else:
-             # Just show the final response
-             return channels.get("final", "No final response generated")
-
-     except Exception as e:
-         return f"[Error] {type(e).__name__}: {str(e)}"
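The deleted handler accepts both Gradio history shapes; the two inputs below (illustrative) normalize to the same message list before templating:

    # Two equivalent Gradio histories (illustrative)
    tuples_history = [("hi", "hello!")]
    messages_history = [{"role": "user", "content": "hi"},
                        {"role": "assistant", "content": "hello!"}]
    # Either becomes: system turn, then user "hi", assistant "hello!",
    # then the new user message appended last.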

- # -----------------------
- # Extra handler: Harmony Inspector wrapper
- # -----------------------

- def
      try:
-         msgs =
-         prompt =
-         return
-             prompt,
-             {"do_sample": True, "temperature": 0.7, "top_p": 0.9, "top_k": 0, "max_new_tokens": MAX_DEF}
-         )
      except Exception as e:
-         return

- # -----------------------
- # UI
- # -----------------------
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown(
- …
-     ""
- …
-     with gr.Accordion("Generation Settings ", open=False):
-         # NEW: toggle to bypass Harmony and use plain chat_template like your minimal script
-         simple_mode = gr.Checkbox(
-             value=True,
-             label="Use simple chat_template (no Harmony)",
-             info="Matches the minimal HF example; safest path for now"
-         )
-         with gr.Row():
-             temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
-             top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
-             top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k (0=disabled)")
-         with gr.Row():
-             max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
-             do_sample = gr.Checkbox(value=True, label="Do sample")
-             seed = gr.Number(value=None, label="Seed (optional)", precision=0)
-         with gr.Row():
-             reasoning_effort = gr.Radio(
-                 choices=["low", "medium", "high"],
-                 value="high",
-                 label="Reasoning Effort",
-                 info="How much thinking the model should do"
-             )
-             show_thinking = gr.Checkbox(
-                 value=False,
-                 label="Show thinking channels",
-                 info="Display all internal reasoning channels"
-             )
-
-     with gr.Accordion("Rose Guidance (Optional)", open=False):
-         gr.Markdown("Fine-tune generation with token biases")
-         with gr.Row():
-             rose_enable = gr.Checkbox(value=False, label="Enable Rose bias")
-             rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
-             rose_score = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Score multiplier")
-         rose_tokens = gr.Textbox(
-             label="Token:weight pairs",
-             placeholder="example:1.5, test:-0.5",
-             value=""
-         )
-         rose_json = gr.Textbox(
-             label="JSON weights",
-             placeholder='{"token": 1.0, "another": -0.5}',
-             value=""
-         )
-
-     # --- Harmony Inspector UI ---
-     with gr.Accordion("Harmony Inspector", open=False):
-         debug_prompt = gr.Textbox(label="Debug prompt", value="What is 2+2? Reply with just the number.")
-         run_debug = gr.Button("Run Harmony Inspect")
-         debug_out = gr.JSON(label="Parsed Harmony output", value={})
-         run_debug.click(harmony_inspect_handler, inputs=[debug_prompt, system_prompt, reasoning_effort], outputs=[debug_out])
-
-     # Chat interface - using only valid parameters
-     chat = gr.ChatInterface(
-         fn=generate_response,
          type="messages",
-         additional_inputs=[
- …
-             do_sample, seed, rose_enable, rose_alpha, rose_score,
-             rose_tokens, rose_json, show_thinking, simple_mode, reasoning_effort
-         ],
-         title="Chat with Mirel",
-         description="A chain-of-thought model using Harmony format",
-         examples=[
-             ["Hello! Can you introduce yourself?"],
-             ["What is the capital of France?"],
-             ["Explain quantum computing in simple terms"],
-             ["Solve: If a train travels 120 miles in 2 hours, what is its average speed?"],
-         ],
          cache_examples=False,
      )

-     gr.Markdown(
-         """
-         ---
-         ### Configuration:
-         - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
-         - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER`
-         - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
-         - **Harmony**: Install with `pip install openai-harmony` for proper channel support
-
-         The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
-         """
-     )
-
  if __name__ == "__main__":
-     demo.queue(max_size=
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False
-     )
  """
+ Mirel – Minimal Rose LoRA Inference (HF Space)
+ ZeroGPU-only, no Harmony, no extra config
  Single file: app.py
  """
  from __future__ import annotations
+ import os, gc, json, torch
+ from typing import Optional, Dict, Any, List
  import gradio as gr
+ import spaces
+ from transformers import AutoTokenizer, AutoModelForCausalLM

  # -----------------------
+ # Constants / Env
  # -----------------------
+ MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
+ # Default to your Rose LoRA
+ ADAPTER_ID = os.getenv("ADAPTER_ID", "AbstractPhil/mirel-gpt-oss-20b")
+ ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER", "checkpoints/checkpoint-516")
  HF_TOKEN: Optional[str] = (
+     os.getenv("HF_TOKEN")
+     or os.getenv("HUGGING_FACE_HUB_TOKEN")
      or os.getenv("HUGGINGFACEHUB_API_TOKEN")
      or os.getenv("HF_ACCESS_TOKEN")
  )

  os.environ["TOKENIZERS_PARALLELISM"] = "false"

+ # Tokenizer is lightweight; OK to load on CPU at import time
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
+ if tokenizer.pad_token_id is None:
+     tokenizer.pad_token_id = tokenizer.eos_token_id

  # -----------------------
+ # Rose helpers
  # -----------------------
+ def _parse_rose_inputs(rose_tokens: str, rose_json: str) -> Optional[Dict[str, float]]:
+     """Merge "token:weight, ..." and JSON {token: weight} into a dict."""
+     mapping: Dict[str, float] = {}
+     if rose_tokens:
+         for part in [p.strip() for p in rose_tokens.split(",") if p.strip()]:
+             if ":" in part:
+                 k, v = part.split(":", 1)
+                 try:
+                     mapping[k.strip()] = float(v)
+                 except Exception:
+                     pass
+     if rose_json:
          try:
+             j = json.loads(rose_json)
+             if isinstance(j, dict):
+                 for k, v in j.items():
+                     try:
+                         mapping[str(k)] = float(v)
+                     except Exception:
+                         pass
          except Exception:
              pass
+     return mapping or None
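A quick check of the merged parsing (values illustrative):

    # _parse_rose_inputs merges both styles and returns None when empty
    assert _parse_rose_inputs("alpha:1.0, beta:-0.5", '{"gamma": 2}') == \
        {"alpha": 1.0, "beta": -0.5, "gamma": 2.0}
    assert _parse_rose_inputs("", "") is None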

+ class _RoseLogits(torch.nn.Module):
+     def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
+         super().__init__()
+         self.bias_vec = bias_vec
+         self.alpha = float(alpha)
+     def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+         return scores + self.alpha * self.bias_vec.to(scores.device)

+ def _bias_from_tokens(tok, mapping: Dict[str, float]) -> torch.Tensor:
+     bias = torch.zeros(len(tok), dtype=torch.float32)
+     for s, w in mapping.items():
+         tid = tok.convert_tokens_to_ids(s)
          if isinstance(tid, list):
              for t in tid:
                  if isinstance(t, int) and t >= 0:
  …
              bias[tid] += float(w)
      return bias
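_bias_from_tokens biases exact vocabulary entries, so a word that the tokenizer splits into several pieces only matches if given as its literal token string. A hedged variant that spreads the weight over each piece (helper name hypothetical, not part of the commit):

    # Hypothetical variant: bias every piece of a multi-token word
    def _bias_from_words(tok, mapping: Dict[str, float]) -> torch.Tensor:
        bias = torch.zeros(len(tok), dtype=torch.float32)
        for word, w in mapping.items():
            for tid in tok.encode(word, add_special_tokens=False):
                bias[tid] += float(w)
        return bias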

  # -----------------------
+ # ZeroGPU inference (GPU work ONLY inside this function)
  # -----------------------
  @spaces.GPU(duration=120)
+ def gpu_generate(prompt_str: str,
+                  temperature: float,
+                  max_new_tokens: int,
+                  rose_tokens: str,
+                  rose_json: str,
+                  rose_alpha: float,
+                  seed: Optional[int]) -> str:
+     """Run a single completion on GPU and return only the generated text.
+     No Harmony. Uses chat template; slices completion by prompt length.
+     """
+     torch.set_grad_enabled(False)
      model = None
      try:
          if seed is not None:
              torch.manual_seed(int(seed))

+         from peft import PeftModel
+         # Load base model on GPU via accelerate's device_map
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID,
+             device_map="auto",
+             torch_dtype="auto",
+             trust_remote_code=True,
+             low_cpu_mem_usage=True,
+             token=HF_TOKEN,
+         )
+         if ADAPTER_ID:
+             peft_kwargs: Dict[str, Any] = {"is_trainable": False, "token": HF_TOKEN}
+             if ADAPTER_SUBFOLDER:
+                 peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
+             model = PeftModel.from_pretrained(model, ADAPTER_ID, **peft_kwargs)
+         model.eval()
+         if getattr(model.config, "pad_token_id", None) is None:
+             model.config.pad_token_id = tokenizer.pad_token_id
+
+         device = next(model.parameters()).device
          enc = tokenizer(prompt_str, return_tensors="pt")
          inputs = {k: v.to(device) for k, v in enc.items()}
          if "attention_mask" not in inputs:
              inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
+         prompt_len = int(inputs["input_ids"].shape[1])

+         # Rose bias (optional)
          logits_processor = None
+         mapping = _parse_rose_inputs(rose_tokens, rose_json)
+         if mapping:
+             bias = _bias_from_tokens(tokenizer, mapping).to(device)
+             logits_processor = [_RoseLogits(bias, float(rose_alpha))]

+         out = model.generate(
              **inputs,
+             do_sample=True,
+             temperature=float(temperature),
+             max_new_tokens=int(max_new_tokens),
              pad_token_id=model.config.pad_token_id,
              logits_processor=logits_processor,
          )
+         new_ids = out[0, prompt_len:]
+         return tokenizer.decode(new_ids, skip_special_tokens=True)
      except Exception as e:
+         return f"[Error] {type(e).__name__}: {e}"
      finally:
          try:
              del model
  …
              torch.cuda.empty_cache()
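Slicing with out[0, prompt_len:] works because generate returns prompt and completion concatenated along the sequence axis; a toy illustration:

    # Why slicing by prompt length isolates the completion (illustrative ids)
    import torch
    out = torch.tensor([[7, 8, 9, 42, 43]])  # 3 prompt ids + 2 generated ids
    prompt_len = 3
    assert out[0, prompt_len:].tolist() == [42, 43]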

  # -----------------------
+ # Gradio glue (no streaming; minimal controls)
+ # -----------------------
+ def _build_messages(message, history) -> List[Dict[str, str]]:
+     msgs: List[Dict[str, str]] = []
+     # Keep it simple: prepend a small system to steady tone
+     msgs.append({"role": "system", "content": "You are Mirel."})
+     if isinstance(history, list):
+         for m in history:
+             if isinstance(m, dict) and "role" in m:
+                 msgs.append({"role": m["role"], "content": str(m.get("content", ""))})
+             elif isinstance(m, (list, tuple)) and len(m) >= 2:
+                 u, a = m[0], m[1]
+                 if u: msgs.append({"role": "user", "content": str(u)})
+                 if a: msgs.append({"role": "assistant", "content": str(a)})
+     if isinstance(message, dict):
+         msgs.append({"role": message.get("role", "user"), "content": str(message.get("content", ""))})
+     else:
+         msgs.append({"role": "user", "content": str(message)})
+     return msgs
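An illustrative round trip from UI inputs to the prompt string handed to the GPU worker:

    # From chat history to a chat-template prompt (inputs illustrative)
    msgs = _build_messages("What is 2+2?", [("hi", "hello!")])
    prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)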

+ def ui_generate(message, history, temperature, max_new_tokens, rose_alpha, rose_tokens, rose_json, seed):
      try:
+         msgs = _build_messages(message, history)
+         prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
+         return gpu_generate(prompt, float(temperature), int(max_new_tokens), rose_tokens or "", rose_json or "", float(rose_alpha), int(seed) if seed is not None else None)
      except Exception as e:
+         return f"[Error] {type(e).__name__}: {e}"

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # Mirel – Rose LoRA Inference (ZeroGPU)
+     Minimal chat using your Rose LoRA adapter. No Harmony. GPU work runs under ZeroGPU.
+     """)
+
+     with gr.Accordion("Generation", open=True):
+         temperature = gr.Slider(0.0, 2.0, value=0.6, step=0.05, label="Temperature")
+         max_new = gr.Slider(16, 2048, value=512, step=8, label="Max new tokens")
+         seed = gr.Number(value=None, label="Seed (optional)", precision=0)
+
+     with gr.Accordion("Rose guidance", open=False):
+         rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
+         rose_tokens = gr.Textbox(label="token:weight comma list", placeholder="e.g. reason:1.2, simple:-0.4", value="")
+         rose_json = gr.Textbox(label="JSON {token: weight}", placeholder='{"reason": 1.0, "ramble": -0.8}', value="")
+
+     gr.ChatInterface(
+         fn=ui_generate,
          type="messages",
+         additional_inputs=[temperature, max_new, rose_alpha, rose_tokens, rose_json, seed],
+         title="Mirel",
          cache_examples=False,
      )

  if __name__ == "__main__":
+     demo.queue(max_size=16).launch(server_name="0.0.0.0", server_port=7860)
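Because gpu_generate reloads the base model and adapter on every call, no state persists between requests; with a fixed seed, two calls should reproduce the same sample (an assumption: this also requires deterministic kernels, which not every GPU setup guarantees):

    # Illustrative determinism check across two stateless ZeroGPU calls
    a = gpu_generate("Hello", 0.6, 32, "", "", 1.0, 1234)
    b = gpu_generate("Hello", 0.6, 32, "", "", 1.0, 1234)
    assert a == b  # same seed, same stateless load -> same sample (assumption)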