AbstractPhil committed on
Commit a272f29 · 1 Parent(s): 73c138b
Files changed (1)
  1. app.py +51 -112
app.py CHANGED
@@ -31,66 +31,14 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, toke
 if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
-# -----------------------
-# Rose helpers
-# -----------------------
-def _parse_rose_inputs(rose_tokens: str, rose_json: str) -> Optional[Dict[str, float]]:
-    """Merge "token:weight, ..." and JSON {token: weight} into a dict."""
-    mapping: Dict[str, float] = {}
-    if rose_tokens:
-        for part in [p.strip() for p in rose_tokens.split(",") if p.strip()]:
-            if ":" in part:
-                k, v = part.split(":", 1)
-                try:
-                    mapping[k.strip()] = float(v)
-                except Exception:
-                    pass
-    if rose_json:
-        try:
-            j = json.loads(rose_json)
-            if isinstance(j, dict):
-                for k, v in j.items():
-                    try:
-                        mapping[str(k)] = float(v)
-                    except Exception:
-                        pass
-        except Exception:
-            pass
-    return mapping or None
-
-class _RoseLogits(torch.nn.Module):
-    def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
-        super().__init__()
-        self.bias_vec = bias_vec
-        self.alpha = float(alpha)
-    def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        return scores + self.alpha * self.bias_vec.to(scores.device)
-
-def _bias_from_tokens(tok, mapping: Dict[str, float]) -> torch.Tensor:
-    bias = torch.zeros(len(tok), dtype=torch.float32)
-    for s, w in mapping.items():
-        tid = tok.convert_tokens_to_ids(s)
-        if isinstance(tid, list):
-            for t in tid:
-                if isinstance(t, int) and t >= 0:
-                    bias[t] += float(w) / max(1, len(tid))
-        elif isinstance(tid, int) and t >= 0:
-            bias[tid] += float(w)
-    return bias
-
 # -----------------------
 # ZeroGPU inference (GPU work ONLY inside this function)
 # -----------------------
 @spaces.GPU(duration=120)
-def gpu_generate(prompt_str: str,
-                 temperature: float,
-                 max_new_tokens: int,
-                 rose_tokens: str,
-                 rose_json: str,
-                 rose_alpha: float,
-                 seed: Optional[int]) -> str:
-    """Run a single completion on GPU and return only the generated text.
-    No Harmony. Uses chat template; slices completion by prompt length.
+def gpu_generate(prompt_str: str, seed: Optional[int] = None, max_new_tokens: int = 512) -> str:
+    """Minimal generation using GPT-OSS-20B + Rose LoRA.
+    - All CUDA work stays inside this function (ZeroGPU-safe).
+    - No Harmony, no extra knobs; rely on model defaults.
     """
     torch.set_grad_enabled(False)
     model = None
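
Note: the deleted Rose helpers implemented plain logit biasing: parse "token:weight" pairs (and/or a JSON map), build a vocabulary-sized bias vector, and add alpha * bias to the next-token scores at every decoding step. A minimal sketch of that pattern using the transformers LogitsProcessor API (illustrative names, not the removed code):

    import torch
    from transformers import LogitsProcessor, LogitsProcessorList

    class TokenBias(LogitsProcessor):
        """Add a fixed per-token bias to the next-token scores on every step."""
        def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
            self.bias_vec = bias_vec          # shape: (vocab_size,)
            self.alpha = float(alpha)         # guidance strength
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
            return scores + self.alpha * self.bias_vec.to(scores.device)

    # bias = torch.zeros(len(tokenizer)); bias[token_id] += weight
    # model.generate(..., logits_processor=LogitsProcessorList([TokenBias(bias, alpha=1.0)]))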
@@ -99,47 +47,46 @@ def gpu_generate(prompt_str: str,
         torch.manual_seed(int(seed))
 
         from peft import PeftModel
-        # Load base model on GPU via accelerate's device_map
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="auto",
+
+        model_kwargs = dict(
+            attn_implementation="eager",
             torch_dtype="auto",
+            use_cache=True,
+            device_map="auto",
             trust_remote_code=True,
             low_cpu_mem_usage=True,
             token=HF_TOKEN,
         )
+        base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
+
         if ADAPTER_ID:
             peft_kwargs: Dict[str, Any] = {"is_trainable": False, "token": HF_TOKEN}
             if ADAPTER_SUBFOLDER:
                 peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
-            model = PeftModel.from_pretrained(model, ADAPTER_ID, **peft_kwargs)
+            peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID, **peft_kwargs)
+            model = peft_model.merge_and_unload()
+        else:
+            model = base_model
+
         model.eval()
         if getattr(model.config, "pad_token_id", None) is None:
             model.config.pad_token_id = tokenizer.pad_token_id
 
         device = next(model.parameters()).device
         enc = tokenizer(prompt_str, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in enc.items()}
-        if "attention_mask" not in inputs:
-            inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
-        prompt_len = int(inputs["input_ids"].shape[1])
-
-        # Rose bias (optional)
-        logits_processor = None
-        mapping = _parse_rose_inputs(rose_tokens, rose_json)
-        if mapping:
-            bias = _bias_from_tokens(tokenizer, mapping).to(device)
-            logits_processor = [_RoseLogits(bias, float(rose_alpha))]
-
-        out = model.generate(
-            **inputs,
-            do_sample=True,
-            temperature=float(temperature),
+        input_ids = enc["input_ids"].to(device)
+        attention_mask = enc.get("attention_mask")
+        if attention_mask is None:
+            attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
+
+        prompt_len = int(input_ids.shape[-1])
+        output_ids = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
             max_new_tokens=int(max_new_tokens),
             pad_token_id=model.config.pad_token_id,
-            logits_processor=logits_processor,
         )
-        new_ids = out[0, prompt_len:]
+        new_ids = output_ids[0, prompt_len:]
         return tokenizer.decode(new_ids, skip_special_tokens=True)
     except Exception as e:
         return f"[Error] {type(e).__name__}: {e}"
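
Note: merge_and_unload() above folds the LoRA deltas into the base weights and returns a plain transformers model, so each ZeroGPU call generates without the PEFT wrapper. A roughly equivalent standalone sketch, assuming MODEL_ID and ADAPTER_ID are defined as in app.py:

    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto")
    model = PeftModel.from_pretrained(base, ADAPTER_ID, is_trainable=False).merge_and_unload()
    model.eval()  # ready for model.generate(...) with no adapter indirection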
@@ -152,28 +99,31 @@ def gpu_generate(prompt_str: str,
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-# -----------------------
-# Gradio glue (no streaming; minimal controls)
-# -----------------------
-def _build_messages(message, history) -> List[Dict[str, str]]:
-    msgs: List[Dict[str, str]] = []
-    # Keep it simple: prepend a small system to steady tone
-    msgs.append({"role": "system", "content": "You are Mirel."})
-    if isinstance(history, list):
-        for m in history:
-            if isinstance(m, dict) and "role" in m:
-                msgs.append({"role": m["role"], "content": str(m.get("content", ""))})
-            elif isinstance(m, (list, tuple)) and len(m) >= 2:
-                u, a = m[0], m[1]
-                if u: msgs.append({"role": "user", "content": str(u)})
-                if a: msgs.append({"role": "assistant", "content": str(a)})
-    if isinstance(message, dict):
-        msgs.append({"role": message.get("role", "user"), "content": str(message.get("content", ""))})
-    else:
-        msgs.append({"role": "user", "content": str(message)})
-    return msgs
-
 
+def ui_generate(message, history):
+    try:
+        # ChatInterface(type='messages') gives OpenAI-style dicts.
+        msgs: List[Dict[str, str]] = []
+        # Keep defaults: no explicit system beyond template defaults
+        if isinstance(history, list):
+            for m in history:
+                if isinstance(m, dict) and "role" in m:
+                    msgs.append({"role": m.get("role", "user"), "content": str(m.get("content", ""))})
+                elif isinstance(m, (list, tuple)) and len(m) >= 2:
+                    u, a = m[0], m[1]
+                    if u:
+                        msgs.append({"role": "user", "content": str(u)})
+                    if a:
+                        msgs.append({"role": "assistant", "content": str(a)})
+        if isinstance(message, dict):
+            msgs.append({"role": message.get("role", "user"), "content": str(message.get("content", ""))})
+        else:
+            msgs.append({"role": "user", "content": str(message)})
+
+        prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
+        return gpu_generate(prompt)
+    except Exception as e:
+        return f"[Error] {type(e).__name__}: {e}"
 def ui_generate(message, history, temperature, max_new_tokens, rose_alpha, rose_tokens, rose_json, seed):
     try:
         msgs = _build_messages(message, history)
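
Note: with type="messages", the new ui_generate receives OpenAI-style role/content dicts, renders them through the tokenizer's chat template, and hands the resulting prompt string to gpu_generate. An illustrative call, assuming the tokenizer and functions defined in app.py:

    msgs = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there."},
        {"role": "user", "content": "Summarize our chat so far."},
    ]
    prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
    completion = gpu_generate(prompt)  # returns only the newly generated text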
@@ -184,24 +134,13 @@ def ui_generate(message, history, temperature, max_new_tokens, rose_alpha, rose_
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # Mirel – Rose LoRA Inference (ZeroGPU)
-    Minimal chat using your Rose LoRA adapter. No Harmony. GPU work runs under ZeroGPU.
+    # Mirel – Rose LoRA (ZeroGPU, minimal)
+    Loads GPT‑OSS‑20B + Rose LoRA and generates with default settings.
     """)
 
-    with gr.Accordion("Generation", open=True):
-        temperature = gr.Slider(0.0, 2.0, value=0.6, step=0.05, label="Temperature")
-        max_new = gr.Slider(16, 2048, value=512, step=8, label="Max new tokens")
-        seed = gr.Number(value=None, label="Seed (optional)", precision=0)
-
-    with gr.Accordion("Rose guidance", open=False):
-        rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
-        rose_tokens = gr.Textbox(label="token:weight comma list", placeholder="e.g. reason:1.2, simple:-0.4", value="")
-        rose_json = gr.Textbox(label="JSON {token: weight}", placeholder='{"reason": 1.0, "ramble": -0.8}', value="")
-
     gr.ChatInterface(
         fn=ui_generate,
         type="messages",
-        additional_inputs=[temperature, max_new, rose_alpha, rose_tokens, rose_json, seed],
         title="Mirel",
         cache_examples=False,
     )
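
Note: with the accordions and additional_inputs removed, ChatInterface invokes fn as fn(message, history). A sketch of the resulting UI wiring, assuming a demo.launch() call follows elsewhere in app.py:

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.ChatInterface(
            fn=ui_generate,        # (message, history) -> completion string
            type="messages",
            title="Mirel",
            cache_examples=False,
        )
    # demo.launch()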
 