AbstractPhil committed
Commit bbabb73 · 1 Parent(s): 40292d5

finally got claude to add harmony format

Files changed (2):
  1. app.py +277 -253
  2. requirements.txt +2 -1

app.py CHANGED
@@ -1,17 +1,36 @@
  """
  Mirel Harmony Inference – HF Space (Gradio)
  ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
- Chain-of-thought model with proper channel extraction
  Single file: app.py
  """
  from __future__ import annotations
- import os, gc, json, threading, torch, traceback
  from dataclasses import dataclass
- from typing import List, Dict, Optional, Any, Iterator
  import gradio as gr
  import spaces  # required for ZeroGPU
  from transformers import AutoTokenizer, AutoModelForCausalLM

  # -----------------------
  # Config & runtime modes
  # -----------------------
@@ -23,11 +42,14 @@ ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
  DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
  SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
- MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "512"))
- ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "1")) == "1"
  LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"

- # HF Auth - check for token in environment or use OAuth
  HF_TOKEN: Optional[str] = (
      os.getenv("HF_TOKEN")
      or os.getenv("HUGGING_FACE_HUB_TOKEN")
@@ -35,20 +57,34 @@ HF_TOKEN: Optional[str] = (
      or os.getenv("HF_ACCESS_TOKEN")
  )

- # For private model access via token (if not using OAuth)
- if HF_TOKEN:
-     try:
-         from huggingface_hub import login
-         login(token=HF_TOKEN, add_to_git_credential=True)
-         print(f"[HF Auth] Using token from environment")
-     except Exception as e:
-         print(f"[HF Auth] Token login failed: {e}")
- else:
-     print("[HF Auth] No token in environment - OAuth will be available in UI")

  os.environ["TOKENIZERS_PARALLELISM"] = "false"

- # Tokenizer is lightweight; load once (pass token for private models)
  try:
      tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
      print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
@@ -57,11 +93,8 @@ except Exception as e:
      raise

  # -----------------------
- # Lazy model loader (ZeroGPU-friendly)
  # -----------------------
- _model: Optional[AutoModelForCausalLM] = None
- _model_lock = threading.Lock()
-
  try:
      from peft import PeftModel
      _HAS_PEFT = True
@@ -76,12 +109,11 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
          attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
          trust_remote_code=True,
          low_cpu_mem_usage=True,
-         token=HF_TOKEN,  # Add token here for private model access
      )
-     # Only enable 4-bit when not explicitly CPU-bound
      if LOAD_4BIT and device_map != "cpu":
          try:
-             import bitsandbytes as _bnb  # noqa: F401
              kw.update(load_in_4bit=True)
              if kw["device_map"] is None:
                  kw["device_map"] = "auto"
@@ -109,30 +141,88 @@ def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
      return model

  # -----------------------
- # Harmony formatting & CoT extraction
  # -----------------------

- def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
-     """
-     Strict Harmony: rely on the tokenizer's official chat template.
-     """
-     tmpl = getattr(tokenizer, "chat_template", None)
-     if not tmpl:
-         raise RuntimeError(
-             "Missing Harmony chat_template on this tokenizer. Use a Harmony-enabled repo (e.g., openai/gpt-oss-20b)."
-         )
-     return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

- def extract_final_channel(text: str) -> str:
-     """
-     Extract the final channel from chain-of-thought output.
-     The model outputs thinking in internal channels and final response in final channel.
-     """
      # Look for the final channel marker
      final_marker = "<|channel|>final<|message|>"

      if final_marker in text:
-         # Extract everything after the final channel marker
          parts = text.split(final_marker)
          if len(parts) > 1:
              final_text = parts[-1]
@@ -145,16 +235,15 @@ def extract_final_channel(text: str) -> str:

              return final_text.strip()

-     # If no channel markers found, return the cleaned text
-     # (might be a non-CoT response or error)
      return text.strip()

  # -----------------------
- # Optional Rose guidance (logits bias)
  # -----------------------

  def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
-     """Create vocab bias from {token: weight}. Unknown tokens ignored."""
      vocab_size = len(tokenizer)
      bias = torch.zeros(vocab_size, dtype=torch.float32)
      for tok, w in mapping.items():
@@ -178,168 +267,110 @@ class RoseGuidedLogits(torch.nn.Module):
      def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
          return scores + self.alpha * self.bias_vec.to(scores.device)

- # Use appropriate decorator based on whether ZeroGPU is enabled
- if ZEROGPU:
-     @spaces.GPU(duration=120)
-     def zerogpu_generate(full_prompt: str,
-                          gen_kwargs: Dict[str, Any],
-                          rose_map: Optional[Dict[str, float]],
-                          rose_alpha: float,
-                          rose_score: Optional[float],
-                          seed: Optional[int]) -> str:
-         """Run inference on GPU (ZeroGPU-safe)."""
-         try:
-             if seed is not None:
-                 torch.manual_seed(int(seed))
-
-             # Load model
-             model = _load_model_on("auto")
-
-             # Setup logits processor for Rose guidance
-             logits_processor = None
-             if rose_map:
-                 bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
-                 eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
-                 logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
-
-             # Tokenize input
-             inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
-
-             # Non-streaming generation
-             out_ids = model.generate(
-                 **inputs,
-                 do_sample=bool(gen_kwargs.get("do_sample", True)),
-                 temperature=float(gen_kwargs.get("temperature", 0.7)),
-                 top_p=float(gen_kwargs.get("top_p", 0.9)),
-                 top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
-                 max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-                 pad_token_id=tokenizer.eos_token_id,
-                 eos_token_id=tokenizer.eos_token_id,
-                 logits_processor=logits_processor,
-             )
-
-             # Decode the full output (including special tokens for CoT)
-             prompt_len = int(inputs["input_ids"].shape[1])
-             gen_ids = out_ids[0][prompt_len:]
              decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)

-             return decoded
-
-         except Exception as e:
-             error_msg = f"Generation failed: {str(e)}"
-             print(f"[Error] {error_msg}")
-             print(traceback.format_exc())
-             return error_msg
-         finally:
-             # Cleanup
-             try:
-                 del model
-             except:
-                 pass
-             gc.collect()
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()
- else:
-     def zerogpu_generate(full_prompt: str,
-                          gen_kwargs: Dict[str, Any],
-                          rose_map: Optional[Dict[str, float]],
-                          rose_alpha: float,
-                          rose_score: Optional[float],
-                          seed: Optional[int]) -> str:
-         """Run inference without ZeroGPU decorator."""
-         # Same implementation as above but without the decorator
          try:
-             if seed is not None:
-                 torch.manual_seed(int(seed))
-
-             # Load model
-             model = _load_model_on("auto" if torch.cuda.is_available() else "cpu")
-
-             # Setup logits processor for Rose guidance
-             logits_processor = None
-             if rose_map:
-                 bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
-                 eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
-                 logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
-
-             # Tokenize input
-             inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
-
-             # Non-streaming generation
-             out_ids = model.generate(
-                 **inputs,
-                 do_sample=bool(gen_kwargs.get("do_sample", True)),
-                 temperature=float(gen_kwargs.get("temperature", 0.7)),
-                 top_p=float(gen_kwargs.get("top_p", 0.9)),
-                 top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
-                 max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-                 pad_token_id=tokenizer.eos_token_id,
-                 eos_token_id=tokenizer.eos_token_id,
-                 logits_processor=logits_processor,
-             )
-
-             # Decode the full output (including special tokens for CoT)
-             prompt_len = int(inputs["input_ids"].shape[1])
-             gen_ids = out_ids[0][prompt_len:]
-             decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
-
-             return decoded
-
-         except Exception as e:
-             error_msg = f"Generation failed: {str(e)}"
-             print(f"[Error] {error_msg}")
-             print(traceback.format_exc())
-             return error_msg
-         finally:
-             # Cleanup
-             try:
-                 del model
-             except:
-                 pass
-             gc.collect()
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()

  # -----------------------
  # Gradio handlers
  # -----------------------

- def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, str]]:
-     msgs: List[Dict[str, str]] = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
-     for item in history:
-         if not item:
-             continue
-         if isinstance(item, dict) and "role" in item:
-             msgs.append(item)
-             continue
-         if isinstance(item, (list, tuple)) and len(item) == 2:
-             u, a = item
-             if u is not None:
-                 msgs.append({"role": "user", "content": str(u)})
-             if a:
-                 msgs.append({"role": "assistant", "content": str(a)})
-     return msgs
-
- def generate_response(message: Any, history: List[Any], system_prompt: str,
-                       temperature: float, top_p: float, top_k: int, max_new_tokens: int,
-                       do_sample: bool, seed: Optional[int],
-                       rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
-                       rose_tokens: str, rose_json: str,
-                       show_thinking: bool = False) -> str:
      """
-     Non-streaming response generator for ChatInterface.
-     Returns a complete response to avoid h11 Content-Length issues.
      """
      try:
-         # Normalize message and build Harmony prompt
-         if isinstance(message, dict):
-             message = message.get("content", "")

-         msgs = chat_to_messages(history, system_prompt)
-         msgs.append({"role": "user", "content": str(message)})

-         prompt = to_harmony_prompt(msgs)

          # Build Rose map if enabled
          rose_map: Optional[Dict[str, float]] = None
@@ -367,9 +398,9 @@ def generate_response(message: Any, history: List[Any], system_prompt: str,
                  pass
          if not rose_map:
              rose_map = None
-
          # Generate with model
-         full_output = zerogpu_generate(
              prompt,
              {
                  "do_sample": bool(do_sample),
@@ -384,57 +415,43 @@ def generate_response(message: Any, history: List[Any], system_prompt: str,
              int(seed) if seed is not None else None,
          )

-         # Extract final response from CoT output
          if show_thinking:
-             # Show the full chain-of-thought process
-             return f"**Full Output (with thinking):**\n```\n{full_output}\n```\n\n**Final Response:**\n{extract_final_channel(full_output)}"
          else:
              # Just show the final response
-             return extract_final_channel(full_output)

      except Exception as e:
-         error_msg = f"⚠️ Error: {str(e)}"
-         print(f"[Error in generate_response] {error_msg}")
-         print(traceback.format_exc())
-         return error_msg

  # -----------------------
  # UI
  # -----------------------
- css = """
- #chatbot {
-     height: 500px;
- }
- .gradio-container {
-     max-width: 1200px !important;
- }
- """
-
- with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
      gr.Markdown(
          """
-         # Mirel – Harmony Inference (ZeroGPU-ready)

-         Chain-of-thought OSS-20B model with Harmony formatting.
-         The model thinks through problems internally before providing a final response.

-         **Auth:** Set `HF_TOKEN` in Space secrets or add `hf_oauth: true` to README for browser auth.
          """
      )

      with gr.Row():
-         with gr.Column(scale=3):
-             system_prompt = gr.Textbox(
-                 label="System Prompt",
-                 value=SYSTEM_DEF,
-                 lines=2
-             )
-         with gr.Column(scale=1):
-             show_thinking = gr.Checkbox(
-                 value=False,
-                 label="Show thinking process",
-                 info="Display internal CoT reasoning"
-             )

      with gr.Accordion("Generation Settings", open=False):
          with gr.Row():
@@ -445,6 +462,18 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
              max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
              do_sample = gr.Checkbox(value=True, label="Do sample")
              seed = gr.Number(value=None, label="Seed (optional)", precision=0)

      with gr.Accordion("Rose Guidance (Optional)", open=False):
          gr.Markdown("Fine-tune generation with token biases")
@@ -466,46 +495,41 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
      # Chat interface
      chat = gr.ChatInterface(
          fn=generate_response,
-         chatbot=gr.Chatbot(elem_id="chatbot", height=500, type="messages"),
          additional_inputs=[
              system_prompt, temperature, top_p, top_k, max_new,
              do_sample, seed, rose_enable, rose_alpha, rose_score,
-             rose_tokens, rose_json, show_thinking
          ],
-         title=None,  # Title already in markdown
-         description=None,  # Description already in markdown
          cache_examples=False,
      )

      gr.Markdown(
          """
          ---
-         ### Configuration Notes:
-
-         **Authentication Options:**
-         1. **Browser OAuth**: Click "Sign in with Hugging Face" above (easiest)
-         2. **Environment Token**: Set `HF_TOKEN` in Space secrets
-         3. **No Auth**: Works for public models only
-
-         **Important:** For OAuth to work in Spaces, add `hf_oauth: true` to your README.md metadata
-
-         **Other Settings:**
          - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
-         - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER` for PEFT adapters
-         - **ZeroGPU**: Set `ZEROGPU=1` for Spaces with ZeroGPU (default: enabled)
-         - **4-bit**: Set `LOAD_4BIT=1` to enable 4-bit quantization

-         The model uses internal "thinking" channels before producing a final response.
-         Enable "Show thinking process" to see the full chain-of-thought.
          """
      )

  if __name__ == "__main__":
-     # Simple queue configuration
-     demo.queue(
-         max_size=10,
-     ).launch(
          server_name="0.0.0.0",
          server_port=7860,
-         share=False,
      )

  """
  Mirel Harmony Inference – HF Space (Gradio)
  ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
+ Chain-of-thought model with proper channel extraction using openai_harmony
  Single file: app.py
  """
  from __future__ import annotations
+ import os, gc, json, threading, torch
  from dataclasses import dataclass
+ from typing import List, Dict, Optional, Any
+ from datetime import datetime
  import gradio as gr
  import spaces  # required for ZeroGPU
  from transformers import AutoTokenizer, AutoModelForCausalLM

+ # Import Harmony components
+ try:
+     from openai_harmony import (
+         Author,
+         Conversation,
+         HarmonyEncodingName,
+         Message,
+         Role,
+         SystemContent,
+         DeveloperContent,
+         load_harmony_encoding,
+         ReasoningEffort
+     )
+     HARMONY_AVAILABLE = True
+ except ImportError:
+     print("[WARNING] openai_harmony not installed. Install with: pip install openai-harmony")
+     HARMONY_AVAILABLE = False
+
  # -----------------------
  # Config & runtime modes
  # -----------------------
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
  DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
  SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
+ MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "1024"))
+ ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
  LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"

+ # Harmony channels for CoT
+ REQUIRED_CHANNELS = ["thinking", "analysis", "final"]
+
+ # HF Auth - properly handle multiple token env var names
  HF_TOKEN: Optional[str] = (
      os.getenv("HF_TOKEN")
      or os.getenv("HUGGING_FACE_HUB_TOKEN")
      or os.getenv("HF_ACCESS_TOKEN")
  )

+ def _hf_login() -> None:
+     """Login to HF Hub using common env secret names."""
+     if HF_TOKEN:
+         try:
+             from huggingface_hub import login, whoami
+             login(token=HF_TOKEN, add_to_git_credential=True)
+             try:
+                 who = whoami(token=HF_TOKEN)
+                 print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
+             except Exception:
+                 print("[HF Auth] Login successful but couldn't get user info")
+         except Exception as e:
+             print(f"[HF Auth] Login failed: {e}")
+     else:
+         print("[HF Auth] No token found in environment variables")
+
+ # Login before loading any models
+ _hf_login()
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

+ # Load Harmony encoding if available
+ if HARMONY_AVAILABLE:
+     harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+ else:
+     harmony_encoding = None
+
+ # Tokenizer is lightweight; load once
  try:
      tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
      print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")

      raise

  # -----------------------
+ # Model loading
  # -----------------------
  try:
      from peft import PeftModel
      _HAS_PEFT = True

          attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
          trust_remote_code=True,
          low_cpu_mem_usage=True,
+         token=HF_TOKEN,
      )
      if LOAD_4BIT and device_map != "cpu":
          try:
+             import bitsandbytes as _bnb
              kw.update(load_in_4bit=True)
              if kw["device_map"] is None:
                  kw["device_map"] = "auto"
      return model

  # -----------------------
+ # Harmony formatting
  # -----------------------

+ def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> str:
+     """Create a proper Harmony-formatted prompt using openai_harmony."""
+     if not HARMONY_AVAILABLE:
+         # Fallback to tokenizer's chat template
+         return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+
+     # Map reasoning effort
+     effort_map = {
+         "low": ReasoningEffort.LOW,
+         "medium": ReasoningEffort.MEDIUM,
+         "high": ReasoningEffort.HIGH,
+     }
+     effort = effort_map.get(reasoning_effort.lower(), ReasoningEffort.HIGH)
+
+     # Create system message with channels
+     system_content = (
+         SystemContent.new()
+         .with_model_identity(messages[0]["content"] if messages else SYSTEM_DEF)
+         .with_reasoning_effort(effort)
+         .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
+         .with_knowledge_cutoff("2025-01")
+         .with_required_channels(REQUIRED_CHANNELS)
+     )
+
+     # Build conversation
+     harmony_messages = [
+         Message.from_role_and_content(Role.SYSTEM, system_content)
+     ]
+
+     # Add user/assistant messages
+     for msg in messages[1:]:  # Skip system message as we already added it
+         if msg["role"] == "user":
+             harmony_messages.append(
+                 Message.from_role_and_content(Role.USER, msg["content"])
+             )
+         elif msg["role"] == "assistant":
+             # For assistant messages, we might want to preserve channels if they exist
+             harmony_messages.append(
+                 Message.from_role_and_content(Role.ASSISTANT, msg["content"])
+                 .with_channel("final")  # Default to final channel
+             )
+
+     # Create conversation and render
+     convo = Conversation.from_messages(harmony_messages)
+     tokens = harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
+
+     # Convert tokens back to text for the model
+     return tokenizer.decode(tokens)
+
+ def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
+     """Parse response tokens using Harmony format to extract channels."""
+     if not HARMONY_AVAILABLE:
+         # Fallback: just decode and extract final channel manually
+         text = tokenizer.decode(tokens, skip_special_tokens=False)
+         return {"final": extract_final_channel_fallback(text), "raw": text}
+
+     # Parse messages from completion tokens
+     parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)
+
+     # Extract content by channel
+     channels = {}
+     for msg in parsed_messages:
+         channel = msg.channel if hasattr(msg, 'channel') else "final"
+         if channel not in channels:
+             channels[channel] = ""
+         channels[channel] += msg.content
+
+     # Ensure we have a final channel
+     if "final" not in channels:
+         channels["final"] = " ".join(channels.values())
+
+     return channels
+ def extract_final_channel_fallback(text: str) -> str:
221
+ """Fallback extraction when harmony library isn't available."""
 
 
 
222
  # Look for the final channel marker
223
  final_marker = "<|channel|>final<|message|>"
224
 
225
  if final_marker in text:
 
226
  parts = text.split(final_marker)
227
  if len(parts) > 1:
228
  final_text = parts[-1]
 
235
 
236
  return final_text.strip()
237
 
238
+ # If no channel markers found, return cleaned text
 
239
  return text.strip()
240
 
241
  # -----------------------
242
+ # Rose guidance
243
  # -----------------------
244
 
245
  def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
246
+ """Create vocab bias from {token: weight}."""
247
  vocab_size = len(tokenizer)
248
  bias = torch.zeros(vocab_size, dtype=torch.float32)
249
  for tok, w in mapping.items():
 
267
  def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
268
  return scores + self.alpha * self.bias_vec.to(scores.device)
269
 
+ @spaces.GPU(duration=120)
+ def zerogpu_generate(full_prompt: str,
+                      gen_kwargs: Dict[str, Any],
+                      rose_map: Optional[Dict[str, float]],
+                      rose_alpha: float,
+                      rose_score: Optional[float],
+                      seed: Optional[int]) -> Dict[str, str]:
+     """Run inference on GPU and return parsed channels."""
+     try:
+         if seed is not None:
+             torch.manual_seed(int(seed))
+
+         # Load model
+         model = _load_model_on("auto")
+
+         # Setup logits processor for Rose guidance
+         logits_processor = None
+         if rose_map:
+             bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
+             eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
+             logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
+
+         # Tokenize input
+         inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
+
+         # Generate
+         out_ids = model.generate(
+             **inputs,
+             do_sample=bool(gen_kwargs.get("do_sample", True)),
+             temperature=float(gen_kwargs.get("temperature", 0.7)),
+             top_p=float(gen_kwargs.get("top_p", 0.9)),
+             top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+             pad_token_id=tokenizer.eos_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+             logits_processor=logits_processor,
+         )
+
+         # Extract generated tokens only
+         prompt_len = int(inputs["input_ids"].shape[1])
+         gen_ids = out_ids[0][prompt_len:].tolist()
+
+         # Parse response with Harmony
+         if HARMONY_AVAILABLE:
+             channels = parse_harmony_response(gen_ids)
+         else:
+             # Fallback
              decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+             channels = {
+                 "final": extract_final_channel_fallback(decoded),
+                 "raw": decoded
+             }
+
+         return channels

+     except Exception as e:
+         return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": str(e)}
+     finally:
+         # Cleanup
          try:
+             del model
+         except:
+             pass
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()

  # -----------------------
  # Gradio handlers
  # -----------------------

+ def generate_response(message: str, history: List[List[str]], system_prompt: str,
+                       temperature: float, top_p: float, top_k: int, max_new_tokens: int,
+                       do_sample: bool, seed: Optional[int],
+                       rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
+                       rose_tokens: str, rose_json: str,
+                       show_thinking: bool = False,
+                       reasoning_effort: str = "high") -> str:
      """
+     Generate response with proper CoT handling using Harmony format.
      """
      try:
+         # Build message list
+         messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]

+         # Add history
+         if history:
+             for turn in history:
+                 if isinstance(turn, (list, tuple)) and len(turn) >= 2:
+                     user_msg, assistant_msg = turn[0], turn[1]
+                     if user_msg:
+                         messages.append({"role": "user", "content": str(user_msg)})
+                     if assistant_msg:
+                         messages.append({"role": "assistant", "content": str(assistant_msg)})

+         # Add current message
+         messages.append({"role": "user", "content": str(message)})
+
+         # Create Harmony-formatted prompt
+         if HARMONY_AVAILABLE:
+             prompt = create_harmony_prompt(messages, reasoning_effort)
+         else:
+             # Fallback to tokenizer template
+             prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

          # Build Rose map if enabled
          rose_map: Optional[Dict[str, float]] = None

                  pass
          if not rose_map:
              rose_map = None
+
          # Generate with model
+         channels = zerogpu_generate(
              prompt,
              {
                  "do_sample": bool(do_sample),
              int(seed) if seed is not None else None,
          )

+         # Format response
          if show_thinking:
+             # Show all channels
+             response = "## Chain of Thought:\n\n"
+             for channel, content in channels.items():
+                 if channel != "final" and content:
+                     response += f"### {channel.capitalize()} Channel:\n{content}\n\n"
+             response += f"### Final Response:\n{channels.get('final', 'No final response generated')}"
+             return response
          else:
              # Just show the final response
+             return channels.get("final", "No final response generated")

      except Exception as e:
+         return f"[Error] {type(e).__name__}: {str(e)}"

  # -----------------------
  # UI
  # -----------------------
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown(
          """
+         # Mirel – Harmony Chain-of-Thought Inference

+         OSS-20B model using Harmony format with thinking channels.
+         The model thinks through problems in internal channels before providing a final response.

+         **Note:** Install `openai-harmony` for full Harmony support: `pip install openai-harmony`
          """
      )

      with gr.Row():
+         system_prompt = gr.Textbox(
+             label="System Prompt",
+             value=SYSTEM_DEF,
+             lines=2
+         )

      with gr.Accordion("Generation Settings", open=False):
          with gr.Row():

              max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
              do_sample = gr.Checkbox(value=True, label="Do sample")
              seed = gr.Number(value=None, label="Seed (optional)", precision=0)
+         with gr.Row():
+             reasoning_effort = gr.Radio(
+                 choices=["low", "medium", "high"],
+                 value="high",
+                 label="Reasoning Effort",
+                 info="How much thinking the model should do"
+             )
+             show_thinking = gr.Checkbox(
+                 value=False,
+                 label="Show thinking channels",
+                 info="Display all internal reasoning channels"
+             )

      with gr.Accordion("Rose Guidance (Optional)", open=False):
          gr.Markdown("Fine-tune generation with token biases")

      # Chat interface
      chat = gr.ChatInterface(
          fn=generate_response,
          additional_inputs=[
              system_prompt, temperature, top_p, top_k, max_new,
              do_sample, seed, rose_enable, rose_alpha, rose_score,
+             rose_tokens, rose_json, show_thinking, reasoning_effort
+         ],
+         title="Chat with Mirel",
+         description="A chain-of-thought model using Harmony format",
+         examples=[
+             ["Hello! Can you introduce yourself?"],
+             ["What is the capital of France?"],
+             ["Explain quantum computing in simple terms"],
+             ["Solve: If a train travels 120 miles in 2 hours, what is its average speed?"],
          ],
          cache_examples=False,
+         retry_btn="Retry",
+         undo_btn="Undo",
+         clear_btn="Clear",
      )

      gr.Markdown(
          """
          ---
+         ### Configuration:
          - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
+         - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER`
+         - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
+         - **Harmony**: Install with `pip install openai-harmony` for proper channel support

+         The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
          """
      )

  if __name__ == "__main__":
+     demo.queue(max_size=8 if ZEROGPU else 32).launch(
          server_name="0.0.0.0",
          server_port=7860,
+         share=False
      )
requirements.txt CHANGED
@@ -4,4 +4,5 @@ accelerate>=0.33.0
  peft>=0.11.0
  gradio>=5.38.0
  torch>=2.4.0  # ZeroGPU-supported (2.3.x is NOT supported)
- bitsandbytes>=0.43.1
+ bitsandbytes>=0.43.1
+ openai_harmony
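
For reference, a minimal sketch (not part of the commit) of the openai_harmony render/parse round-trip that the new `create_harmony_prompt` and `parse_harmony_response` helpers in app.py build on. Model loading and generation are elided, and `completion_tokens` is a hypothetical placeholder for the token IDs a model run would produce:

```python
# Sketch only: render a conversation into Harmony prompt tokens, then parse the
# model's completion tokens back into per-channel assistant messages.
from openai_harmony import (
    Conversation,
    HarmonyEncodingName,
    Message,
    Role,
    SystemContent,
    load_harmony_encoding,
)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

# Render: conversation -> token prefix the model should complete as ASSISTANT
convo = Conversation.from_messages([
    Message.from_role_and_content(Role.SYSTEM, SystemContent.new()),
    Message.from_role_and_content(Role.USER, "What is 2 + 2?"),
])
prompt_tokens = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
print(f"Rendered {len(prompt_tokens)} prompt tokens")

# Parse: only the newly generated token IDs go back through the encoding.
completion_tokens: list[int] = []  # placeholder: would come from model.generate(...)
if completion_tokens:
    for msg in encoding.parse_messages_from_completion_tokens(completion_tokens, Role.ASSISTANT):
        channel = getattr(msg, "channel", "final")  # e.g. "analysis" or "final"
        print(channel, msg.content)
```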