Spaces:

AbstractPhil
/

GPT-OSS-20B-Mirel

Running on Zero

App Files Files Community

AbstractPhil committed 21 days ago

Commit

02fd900

1 Parent(s): c709572

tweaks, maybe works?

Browse files

Files changed (1) hide show

app.py +309 -50

app.py CHANGED Viewed

@@ -1,64 +1,323 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
 """
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

 """
+Mirel Harmony Inference – HF Space (Gradio)
+ZeroGPU-ready + Harmony formatting + optional Rose-guided decoding
+Single-file app.py
+Env (Spaces → Settings → Variables):
+  MODEL_ID          : base or merged model (e.g., "openai/gpt-oss-20b" or path to merged)
+  ADAPTER_ID        : optional PEFT repo/path (e.g., "AbstractPhil/mirel-gpt-oss-20b")
+  ADAPTER_SUBFOLDER : optional subfolder inside adapter repo (e.g., "checkpoints/checkpoint-516")
+  ZEROGPU           : "1" to enable lazy load/unload (memory saver)
+  LOAD_4BIT         : "1" to attempt 4-bit (bitsandbytes)
+  DTYPE             : bf16 | fp16 | fp32  (default bf16)
+  ATTN_IMPL         : eager | flash_attention_2 (default eager)
+  SYSTEM_PROMPT     : default system message (default: "You are Mirel.")
+  MAX_NEW_TOKENS    : default max new tokens (int, default 512)
+  HF_TOKEN          : (optional) if you need private repo access
+requirements.txt:
+  transformers>=4.43.0
+  accelerate>=0.33.0
+  peft>=0.11.0
+  gradio>=4.36.0
+  torch>=2.3.0
+  bitsandbytes>=0.43.1
 """
+from __future__ import annotations
+import os, gc, json, threading, torch
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Any
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+# -----------------------
+# Config & runtime modes
+# -----------------------
+# Every setting below is read once, at import time, from environment variables.
+# Maps the DTYPE env string to the torch dtype handed to from_pretrained.
+DTYPE_MAP = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
+MODEL_ID          = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
+# `or None` turns an empty-string env value into None, i.e. "no adapter".
+ADAPTER_ID        = os.getenv("ADAPTER_ID") or None
+ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
+ATTN_IMPL         = os.getenv("ATTN_IMPL", "eager")
+# DTYPE is matched case-insensitively; unrecognized values fall back to bfloat16.
+DTYPE             = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
+SYSTEM_DEF        = os.getenv("SYSTEM_PROMPT", "You are Mirel.")
+# int() raises ValueError at import time if MAX_NEW_TOKENS is not an integer string.
+MAX_DEF           = int(os.getenv("MAX_NEW_TOKENS", "512"))
+# Accepts either the ZEROGPU or the ZERO_GPU spelling; only the exact value "1" enables it.
+ZEROGPU           = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
+LOAD_4BIT         = os.getenv("LOAD_4BIT", "0") == "1"
+# Optional: authenticate if HF_TOKEN provided (for private artifacts).
+# Login stays best-effort (the app still starts without it), but a failure is
+# reported so that a later "repo not found" error on a private model is explainable.
+HF_TOKEN = os.getenv("HF_TOKEN")
+if HF_TOKEN:
+    try:
+        from huggingface_hub import login
+        login(token=HF_TOKEN, add_to_git_credential=True)
+    except Exception as exc:
+        # Only the exception type is printed, so the token can never end up in the logs.
+        print(f"[mirel] Hugging Face login failed ({type(exc).__name__}); continuing unauthenticated.")
+# Set before the tokenizer is created (presumably to silence the tokenizers
+# library's parallelism/fork warnings — confirm).
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# Tokenizer is lightweight; load once
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+# -----------------------
+# Lazy model loader (ZeroGPU-ready)
+# -----------------------
+# Process-wide model handle; stays None until _load_model() fills it in.
+_model = None
+# Taken by generate_stream around its load and unload calls.
+_model_lock = threading.Lock()
+# peft is optional: availability is recorded here, and _load_model raises if an
+# adapter is requested while it is missing.
+try:
+    from peft import PeftModel
+    _HAS_PEFT = True
+except Exception:
+    _HAS_PEFT = False
+def _load_model() -> AutoModelForCausalLM:
+    """Return the shared model, building it (and attaching the adapter) on first use.
+
+    Outside ZEROGPU mode placement is delegated to ``device_map="auto"``. In ZEROGPU
+    mode the model is loaded without a device map and then moved to CUDA when
+    available, otherwise to CPU.
+
+    Raises:
+        RuntimeError: ADAPTER_ID is set but peft could not be imported.
+    """
+    global _model
+    if _model is None:
+        load_args: Dict[str, Any] = {
+            "torch_dtype": DTYPE,
+            "device_map": None if ZEROGPU else "auto",
+            "attn_implementation": ATTN_IMPL,
+            "trust_remote_code": True,
+        }
+        if LOAD_4BIT:
+            # 4-bit loading is requested only when bitsandbytes imports cleanly.
+            try:
+                import bitsandbytes as bnb  # noqa: F401
+            except Exception:
+                pass
+            else:
+                load_args["load_in_4bit"] = True
+        loaded = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_args)
+        if ADAPTER_ID:
+            if not _HAS_PEFT:
+                raise RuntimeError("peft is required when ADAPTER_ID is set.")
+            adapter_args = {"subfolder": ADAPTER_SUBFOLDER} if ADAPTER_SUBFOLDER else {}
+            loaded = PeftModel.from_pretrained(loaded, ADAPTER_ID, is_trainable=False, **adapter_args)
+        loaded.eval()
+        loaded.config.use_cache = True
+        if ZEROGPU:
+            # In ZeroGPU we control placement explicitly
+            loaded = loaded.to("cuda" if torch.cuda.is_available() else "cpu")
+        _model = loaded
+    return _model
+def _unload_model_if_zerogpu():
+    """Drop the shared model and free cached CUDA memory; a no-op unless ZEROGPU is on."""
+    global _model
+    if ZEROGPU:
+        try:
+            if _model is not None:
+                _model.to("cpu")
+        except Exception:
+            # Moving off the GPU is best-effort; the reference is dropped either way.
+            pass
+        _model = None
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+# -----------------------
+# Harmony formatting
+# -----------------------
+def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
+    """Render chat messages into a single prompt string ending in an open assistant turn.
+
+    The tokenizer's own chat template is used whenever it defines one. Otherwise a
+    minimal hand-written format is produced: a leading system message becomes a
+    ``<<SYS>>`` header, user/assistant turns are wrapped in role tags, and messages
+    with any other role are dropped.
+    """
+    if getattr(tokenizer, "chat_template", None):
+        return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+    header = ""
+    turns = messages
+    if turns and turns[0]["role"] == "system":
+        header = "<<SYS>>\n" + turns[0]["content"] + "\n<</SYS>>\n\n"
+        turns = turns[1:]
+    rendered = [
+        ("<|user|>\n" if turn["role"] == "user" else "<|assistant|>\n") + turn["content"] + "\n<|end|>"
+        for turn in turns
+        if turn["role"] in ("user", "assistant")
+    ]
+    return header + "\n".join(rendered) + "\n<|assistant|>\n"
+# -----------------------
+# Optional Rose guidance (logits bias)
+# -----------------------
+def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
+    """Create a vocab-sized bias vector from ``{token: weight}``.
+
+    Positive weights promote a token, negative weights demote it. Tokens that are not
+    in the vocabulary are ignored: that covers ``None`` results, ids outside
+    ``[0, len(tokenizer))``, and the tokenizer's unk id unless the unk token itself
+    was the requested key.
+
+    Args:
+        tokenizer: object providing ``__len__`` and ``convert_tokens_to_ids``;
+            ``unk_token`` / ``unk_token_id`` are consulted when present.
+        mapping: vocabulary token string -> additive weight.
+
+    Returns:
+        A float32 tensor of shape ``(len(tokenizer),)``.
+    """
+    vocab_size = len(tokenizer)
+    bias = torch.zeros(vocab_size, dtype=torch.float32)
+    unk_token = getattr(tokenizer, "unk_token", None)
+    unk_id = getattr(tokenizer, "unk_token_id", None)
+    for tok, w in mapping.items():
+        if tok is None:
+            continue
+        resolved = tokenizer.convert_tokens_to_ids(tok)
+        ids = resolved if isinstance(resolved, list) else [resolved]
+        # An unknown token resolves to the unk id; biasing that id would steer the
+        # model toward <unk> instead of ignoring the entry.
+        usable = [
+            t for t in ids
+            if isinstance(t, int) and 0 <= t < vocab_size and (t != unk_id or tok == unk_token)
+        ]
+        if not usable:
+            continue
+        # A multi-id result shares the weight evenly across all of its ids.
+        share = float(w) / max(1, len(ids))
+        for t in usable:
+            bias[t] += share
+    return bias
+class RoseGuidedLogits(torch.nn.Module):
+    """Logits processor that adds a fixed per-token bias, scaled by ``alpha``, to the scores.
+
+    The bias vector is sized from the tokenizer, while ``scores`` has the model's
+    logits width; the two can differ (e.g. when the embedding table is padded). The
+    bias is therefore zero-padded or truncated to the last dimension of ``scores``
+    before the add, so a width mismatch no longer raises a shape error.
+    """
+    def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
+        super().__init__()
+        self.bias_vec = bias_vec
+        self.alpha = float(alpha)
+    def _aligned_bias(self, scores: torch.Tensor) -> torch.Tensor:
+        """Return the 1-D bias on the scores' device, resized to ``scores.shape[-1]``."""
+        bias = self.bias_vec.to(scores.device)
+        width = scores.shape[-1]
+        if bias.shape[-1] == width:
+            return bias
+        aligned = torch.zeros(width, dtype=bias.dtype, device=bias.device)
+        keep = min(width, bias.shape[-1])
+        aligned[:keep] = bias[:keep]
+        return aligned
+    def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        """Return ``scores + alpha * bias``; ``input_ids`` is accepted but not used."""
+        return scores + self.alpha * self._aligned_bias(scores)
+# -----------------------
+# Gradio handlers
+# -----------------------
+@dataclass
+class GenCfg:
+    """Decoding settings for one chat request, built by generate_stream from its arguments."""
+    # Forwarded to generate() as `temperature`.
+    temperature: float
+    # Forwarded to generate() as `top_p`.
+    top_p: float
+    # Forwarded as `top_k` when > 0; otherwise None is passed instead.
+    top_k: int
+    # Forwarded to generate() as `max_new_tokens`.
+    max_new_tokens: int
+    # Forwarded to generate() as `do_sample`.
+    do_sample: bool
+    # When not None, passed through int() to torch.manual_seed before generation.
+    seed: Optional[int]
+def chat_to_messages(history: List[List[str]], system_prompt: str) -> List[Dict[str, str]]:
+    """Convert (user, assistant) history pairs into role/content message dicts.
+
+    The result always starts with a system message, using SYSTEM_DEF when
+    ``system_prompt`` is empty. A user turn is kept unless it is None (so an empty
+    string is kept); an assistant turn is kept only when it is non-empty.
+    """
+    system_turn = {"role": "system", "content": system_prompt or SYSTEM_DEF}
+    msgs: List[Dict[str, str]] = [system_turn]
+    for user_text, assistant_text in history:
+        if user_text is not None:
+            msgs.append({"role": "user", "content": user_text})
+        if assistant_text:
+            msgs.append({"role": "assistant", "content": assistant_text})
+    return msgs
+def _parse_rose_token_map(rose_tokens: str, rose_json: str) -> Dict[str, float]:
+    """Merge the "token:weight, ..." list and the JSON object into one {token: weight} dict.
+
+    Malformed entries and invalid JSON are skipped; JSON entries override list
+    entries that share a key.
+    """
+    token_map: Dict[str, float] = {}
+    for part in (rose_tokens or "").split(","):
+        key, sep, raw = part.strip().partition(":")
+        if not sep:
+            continue
+        try:
+            token_map[key.strip()] = float(raw)
+        except ValueError:
+            continue
+    if rose_json:
+        try:
+            parsed = json.loads(rose_json)
+        except (TypeError, ValueError):
+            parsed = None
+        if isinstance(parsed, dict):
+            for key, raw in parsed.items():
+                try:
+                    token_map[str(key)] = float(raw)
+                except (TypeError, ValueError, OverflowError):
+                    continue
+    return token_map
+def generate_stream(message: str, history: List[List[str]], system_prompt: str,
+                    temperature: float, top_p: float, top_k: int, max_new_tokens: int,
+                    do_sample: bool, seed: int | None,
+                    rose_enable: bool, rose_alpha: float, rose_tokens: str, rose_json: str):
+    """Stream the assistant reply for one chat turn, yielding the accumulated text so far.
+
+    Args:
+        message: the new user message.
+        history: earlier (user, assistant) pairs.
+        system_prompt: system message; empty falls back to SYSTEM_DEF.
+        temperature, top_p, top_k: sampling controls. ``top_k <= 0`` turns top-k off,
+            and ``temperature <= 0`` switches to greedy decoding.
+        max_new_tokens: generation length cap.
+        do_sample: sample when True, decode greedily when False.
+        seed: optional torch RNG seed.
+        rose_enable, rose_alpha, rose_tokens, rose_json: optional logits bias. The
+            token weights come from a "token:weight" list and/or a JSON object.
+
+    Yields:
+        The reply text generated so far, growing with each streamed chunk.
+
+    Raises:
+        RuntimeError: generation failed inside the worker thread.
+    """
+    # UI controls can deliver floats for integer-valued settings, so coerce them here.
+    cfg = GenCfg(float(temperature), float(top_p), int(top_k or 0), int(max_new_tokens), bool(do_sample), seed)
+    if cfg.seed is not None:
+        torch.manual_seed(int(cfg.seed))
+    msgs = chat_to_messages(history, system_prompt)
+    msgs.append({"role": "user", "content": message})
+    prompt = to_harmony_prompt(msgs)
+    thread = None
+    try:
+        # Lazy load. A local reference is kept so that a concurrent reset of the
+        # global handle cannot turn it into None in the middle of this request.
+        with _model_lock:
+            model = _load_model()
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        # Optional logits processor
+        logits_processor = None
+        if rose_enable:
+            token_map = _parse_rose_token_map(rose_tokens, rose_json)
+            if token_map:
+                bias = build_bias_from_tokens(tokenizer, token_map).to(model.device)
+                logits_processor = [RoseGuidedLogits(bias, rose_alpha)]
+        # Sampling with temperature 0 is rejected by generate(); treat it as greedy.
+        sampling = cfg.do_sample and cfg.temperature > 0
+        gen_kwargs = dict(
+            **inputs,
+            do_sample=sampling,
+            max_new_tokens=cfg.max_new_tokens,
+            pad_token_id=tokenizer.eos_token_id,
+            streamer=streamer,
+            logits_processor=logits_processor,
+        )
+        if sampling:
+            gen_kwargs.update(
+                temperature=cfg.temperature,
+                top_p=cfg.top_p,
+                top_k=cfg.top_k if cfg.top_k > 0 else None,
+            )
+        failure: List[BaseException] = []
+        def _worker() -> None:
+            try:
+                model.generate(**gen_kwargs)
+            except Exception as exc:
+                failure.append(exc)
+                # Without an end signal the consumer loop below would wait forever.
+                streamer.end()
+        thread = threading.Thread(target=_worker, daemon=True)
+        thread.start()
+        partial = ""
+        for token in streamer:
+            partial += token
+            yield partial
+        if failure:
+            raise RuntimeError(f"Generation failed: {failure[0]}") from failure[0]
+    finally:
+        # Runs on success, on error and on early close. The worker must have
+        # finished before the model is taken away from it.
+        if thread is not None:
+            thread.join()
+        if ZEROGPU:
+            # NOTE(review): with more than one concurrent request, this unload can
+            # still hit a model another request is using — confirm whether ZEROGPU
+            # requests should be serialized.
+            with _model_lock:
+                _unload_model_if_zerogpu()
+# -----------------------
+# UI
+# -----------------------
+# Page layout: a header, the system prompt box, two collapsed settings panels, the
+# chat widget, then usage notes.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # Mirel – Harmony Inference (ZeroGPU‑ready)
+    OSS‑20B + optional Rose‑SFT adapter. Harmony chat template is applied automatically.
+    """)
+    with gr.Row():
+        system_prompt = gr.Textbox(label="System", value=SYSTEM_DEF)
+    # Decoding controls; their values reach generate_stream through additional_inputs.
+    with gr.Accordion("Generation settings", open=False):
+        with gr.Row():
+            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="temperature")
+            top_p       = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="top_p")
+            top_k       = gr.Slider(0, 200, value=0, step=1, label="top_k (0=off)")
+            max_new     = gr.Slider(16, 2048, value=MAX_DEF, step=8, label="max_new_tokens")
+            do_sample   = gr.Checkbox(value=True, label="do_sample")
+            seed        = gr.Number(value=None, label="seed (optional)")
+    # Logits-bias controls; generate_stream reads them only when the checkbox is ticked.
+    with gr.Accordion("Rose guidance (optional)", open=False):
+        with gr.Row():
+            rose_enable = gr.Checkbox(value=False, label="Enable Rose bias at decode")
+            rose_alpha  = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="rose alpha (strength)")
+        rose_tokens = gr.Textbox(label="token:weight list (comma-separated)", value="")
+        rose_json   = gr.Textbox(label="JSON {token: weight}", value="")
+    # additional_inputs are listed in the same order as generate_stream's parameters
+    # after (message, history).
+    chat = gr.ChatInterface(
+        fn=generate_stream,
+        chatbot=gr.Chatbot(show_copy_button=True, likeable=True, render_markdown=True),
+        additional_inputs=[system_prompt, temperature, top_p, top_k, max_new, do_sample, seed, rose_enable, rose_alpha, rose_tokens, rose_json],
+        title="Mirel",
+        concurrency_limit=2 if ZEROGPU else 4,
+        cache_examples=False,
+    )
+    gr.Markdown("""
+    **Notes**
+    - Set env `ZEROGPU=1` to enable just‑in‑time load and aggressive unload per request.
+    - Set `ADAPTER_ID=AbstractPhil/mirel-gpt-oss-20b` and `ADAPTER_SUBFOLDER=checkpoints/checkpoint-516` to use the provided adapter.
+    - For large contexts on A100/H100 prefer `DTYPE=bf16` and `ATTN_IMPL=eager` unless FA2 is installed.
+    - Rose guidance is optional; it biases logits without changing model weights.
+    """)
 if __name__ == "__main__":
+    # Gradio 4 (this app requires gradio>=4.36.0) removed queue(concurrency_count=...);
+    # default_concurrency_limit is its replacement for the per-event worker limit.
+    demo.queue(max_size=8 if ZEROGPU else 32, default_concurrency_limit=2 if ZEROGPU else 4).launch(server_name="0.0.0.0", server_port=7860)