Spaces · Running on Zero

Commit ec0268d: "yes"
AbstractPhil committed · Parent: 6eb225b
app.py CHANGED
@@ -401,6 +401,91 @@ def zerogpu_generate(full_prompt,
         if torch.cuda.is_available():
             torch.cuda.empty_cache()

+# -----------------------
+# GPU Debug: Harmony Inspector
+# -----------------------
+@spaces.GPU(duration=120)
+def zerogpu_generate_debug(full_prompt, gen_kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    """Minimal GPU path to run a single prompt and return Harmony-parsed output
+    along with short token previews for debugging. Does not use Rose for clarity."""
+    model = None
+    try:
+        model = _load_model_on("auto")
+        device = next(model.parameters()).device
+
+        # Prepare inputs (tokens if Harmony renderer used, else string -> encode)
+        if HARMONY_AVAILABLE and not isinstance(full_prompt, str):
+            token_list = list(full_prompt)
+            if not token_list:
+                raise ValueError("Harmony prompt produced no tokens")
+            input_ids = torch.tensor([token_list], dtype=torch.long, device=device)
+            attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
+            inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
+            prompt_len = input_ids.shape[1]
+        else:
+            enc = tokenizer(full_prompt, return_tensors="pt")
+            inputs = {k: v.to(device) for k, v in enc.items()}
+            if "attention_mask" not in inputs:
+                inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
+            prompt_len = int(inputs["input_ids"].shape[1])
+
+        # Harmony stop via stopping criteria
+        sc = StoppingCriteriaList([StopOnTokens(HARMONY_STOP_IDS)]) if (HARMONY_AVAILABLE and HARMONY_STOP_IDS) else None
+
+        out_ids = model.generate(
+            **inputs,
+            do_sample=bool(gen_kwargs.get("do_sample", True)),
+            temperature=float(gen_kwargs.get("temperature", 0.7)),
+            top_p=float(gen_kwargs.get("top_p", 0.9)),
+            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+            max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+            pad_token_id=model.config.pad_token_id,
+            stopping_criteria=sc,
+            repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.15)),
+            no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
+        )
+
+        out_list = out_ids[0].tolist()
+        gen_ids = out_list[prompt_len:]
+        # Truncate at first Harmony stop token if present
+        if HARMONY_AVAILABLE and HARMONY_STOP_IDS:
+            for sid in HARMONY_STOP_IDS:
+                if sid in gen_ids:
+                    gen_ids = gen_ids[:gen_ids.index(sid)]
+                    break
+
+        # Parse channels
+        if HARMONY_AVAILABLE:
+            try:
+                channels = parse_harmony_response(gen_ids)
+            except Exception:
+                decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+                channels = {"final": extract_final_channel_fallback(decoded), "raw": decoded}
+        else:
+            decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+            channels = {"final": extract_final_channel_fallback(decoded), "raw": decoded}
+
+        # Small previews (avoid flooding logs/UI)
+        preview = {
+            "prompt_len": int(prompt_len),
+            "stop_ids": list(HARMONY_STOP_IDS) if HARMONY_AVAILABLE else [],
+            "gen_len": int(len(gen_ids)),
+            "gen_ids_head": gen_ids[:48],
+            "decoded_head": tokenizer.decode(gen_ids[:256], skip_special_tokens=False),
+            "channels": channels,
+        }
+        return preview
+    except Exception as e:
+        return {"error": f"{type(e).__name__}: {e}"}
+    finally:
+        try:
+            del model
+        except Exception:
+            pass
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
 # -----------------------
 # Gradio handlers
 # -----------------------
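The debug generator above halts on Harmony stop tokens through `StoppingCriteriaList([StopOnTokens(HARMONY_STOP_IDS)])`. `StopOnTokens` is defined earlier in app.py and is not part of this diff; the sketch below shows the shape such a criterion typically has, assuming it subclasses `transformers.StoppingCriteria` and fires when the most recent token is one of the stop ids. This is a hypothetical reconstruction, not the file's actual code.

```python
# Sketch only: a criterion of this shape is assumed to already exist in app.py.
from typing import List

import torch
from transformers import StoppingCriteria


class StopOnTokens(StoppingCriteria):
    """Stop generation once the last generated token is one of `stop_ids`."""

    def __init__(self, stop_ids: List[int]):
        self.stop_ids = {int(i) for i in stop_ids}

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # input_ids has shape (batch, seq_len); inspect the newest token of the first sequence.
        return int(input_ids[0, -1]) in self.stop_ids
```

Stopping inside `generate` this way makes the later in-Python truncation of `gen_ids` a safety net rather than the primary stop mechanism.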
@@ -498,6 +583,21 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
     except Exception as e:
         return f"[Error] {type(e).__name__}: {str(e)}"

+# -----------------------
+# Extra handler: Harmony Inspector wrapper
+# -----------------------
+
+def harmony_inspect_handler(user_prompt: str, system_prompt: str, reasoning_effort: str):
+    try:
+        msgs = [{"role": "system", "content": system_prompt or SYSTEM_DEF}, {"role": "user", "content": user_prompt or "What is 2+2?"}]
+        prompt = create_harmony_prompt(msgs, reasoning_effort)
+        return zerogpu_generate_debug(
+            prompt,
+            {"do_sample": True, "temperature": 0.7, "top_p": 0.9, "top_k": 0, "max_new_tokens": MAX_DEF}
+        )
+    except Exception as e:
+        return {"error": f"{type(e).__name__}: {e}"}
+
 # -----------------------
 # UI
 # -----------------------
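For reference, `harmony_inspect_handler` can also be exercised outside the Gradio UI. On success it returns the preview dict built in `zerogpu_generate_debug`; on any failure either function returns a single-key `{"error": ...}` dict. A usage sketch; the `"low"` effort value is only an assumed example of whatever the Space's `reasoning_effort` control provides.

```python
# Sketch: calling the inspector handler directly, e.g. from a local session with the model loaded.
preview = harmony_inspect_handler(
    user_prompt="What is 2+2? Reply with just the number.",
    system_prompt="",        # empty string falls back to SYSTEM_DEF inside the handler
    reasoning_effort="low",  # assumed example value
)

if "error" in preview:
    print("generation failed:", preview["error"])
else:
    # Keys produced by zerogpu_generate_debug:
    # prompt_len, stop_ids, gen_len, gen_ids_head, decoded_head, channels
    print("prompt tokens:", preview["prompt_len"])
    print("generated tokens:", preview["gen_len"])
    print("channels:", preview["channels"])
```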
@@ -560,7 +660,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     )

     # Chat interface - using only valid parameters
-
+    # --- Harmony Inspector UI ---
+    with gr.Accordion("Harmony Inspector", open=False):
+        debug_prompt = gr.Textbox(label="Debug prompt", value="What is 2+2? Reply with just the number.")
+        run_debug = gr.Button("Run Harmony Inspect")
+        debug_out = gr.JSON(label="Parsed Harmony output", value={})
+        run_debug.click(harmony_inspect_handler, inputs=[debug_prompt, system_prompt, reasoning_effort], outputs=[debug_out])
+
+    chat = gr.ChatInterface(
         fn=generate_response,
         type="messages",
         additional_inputs=[
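The accordion wires the handler's dict straight into a `gr.JSON` component through `Button.click`, with `system_prompt` and `reasoning_effort` reusing controls defined earlier in the Blocks. The same wiring pattern in isolation, as a minimal standalone sketch (illustration only, not the Space's code):

```python
import gradio as gr


def inspect(prompt: str) -> dict:
    # Stand-in for harmony_inspect_handler: any JSON-serializable dict renders in gr.JSON.
    return {"prompt": prompt, "gen_len": 0, "channels": {"final": ""}}


with gr.Blocks() as demo:
    with gr.Accordion("Inspector", open=False):
        box = gr.Textbox(label="Debug prompt")
        btn = gr.Button("Run")
        out = gr.JSON(label="Parsed output", value={})
        btn.click(inspect, inputs=[box], outputs=[out])

if __name__ == "__main__":
    demo.launch()
```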
@@ -579,18 +686,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         cache_examples=False,
     )

-
-
-
-
-
-
-
-
-
-
-
-
+    gr.Markdown(
+        """
+        ---
+        ### Configuration:
+        - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
+        - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER`
+        - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
+        - **Harmony**: Install with `pip install openai-harmony` for proper channel support
+
+        The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
+        """
+    )

 if __name__ == "__main__":
     demo.queue(max_size=8 if ZEROGPU else 32).launch(
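Both the main path and this debug path fall back to `extract_final_channel_fallback` when the Harmony parser is unavailable or raises; that helper also lives outside this diff. A rough sketch of such a fallback, assuming the standard Harmony channel markers used by gpt-oss (`<|channel|>final<|message|>` terminated by `<|return|>` or `<|end|>`); the Space's actual implementation may differ.

```python
import re


def extract_final_channel_fallback(decoded: str) -> str:
    """Best-effort extraction of the `final` channel from raw Harmony-formatted text."""
    # Take the text following the `final` channel marker, up to a terminator token.
    match = re.search(
        r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|<\|end\|>|$)",
        decoded,
        flags=re.DOTALL,
    )
    if match:
        return match.group(1).strip()
    # No channel markers found: return the decoded text unchanged (minus surrounding whitespace).
    return decoded.strip()
```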