AbstractPhil committed on
Commit a272f29 · 1 Parent(s): 73c138b
Files changed (1)
  1. app.py +51 -112
app.py CHANGED
@@ -31,66 +31,14 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, toke
 if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
-# -----------------------
-# Rose helpers
-# -----------------------
-def _parse_rose_inputs(rose_tokens: str, rose_json: str) -> Optional[Dict[str, float]]:
-    """Merge "token:weight, ..." and JSON {token: weight} into a dict."""
-    mapping: Dict[str, float] = {}
-    if rose_tokens:
-        for part in [p.strip() for p in rose_tokens.split(",") if p.strip()]:
-            if ":" in part:
-                k, v = part.split(":", 1)
-                try:
-                    mapping[k.strip()] = float(v)
-                except Exception:
-                    pass
-    if rose_json:
-        try:
-            j = json.loads(rose_json)
-            if isinstance(j, dict):
-                for k, v in j.items():
-                    try:
-                        mapping[str(k)] = float(v)
-                    except Exception:
-                        pass
-        except Exception:
-            pass
-    return mapping or None
-
-class _RoseLogits(torch.nn.Module):
-    def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
-        super().__init__()
-        self.bias_vec = bias_vec
-        self.alpha = float(alpha)
-    def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        return scores + self.alpha * self.bias_vec.to(scores.device)
-
-def _bias_from_tokens(tok, mapping: Dict[str, float]) -> torch.Tensor:
-    bias = torch.zeros(len(tok), dtype=torch.float32)
-    for s, w in mapping.items():
-        tid = tok.convert_tokens_to_ids(s)
-        if isinstance(tid, list):
-            for t in tid:
-                if isinstance(t, int) and t >= 0:
-                    bias[t] += float(w) / max(1, len(tid))
-        elif isinstance(tid, int) and t >= 0:
-            bias[tid] += float(w)
-    return bias
-
 # -----------------------
 # ZeroGPU inference (GPU work ONLY inside this function)
 # -----------------------
 @spaces.GPU(duration=120)
-def gpu_generate(prompt_str: str,
-                 temperature: float,
-                 max_new_tokens: int,
-                 rose_tokens: str,
-                 rose_json: str,
-                 rose_alpha: float,
-                 seed: Optional[int]) -> str:
-    """Run a single completion on GPU and return only the generated text.
-    No Harmony. Uses chat template; slices completion by prompt length.
+def gpu_generate(prompt_str: str, seed: Optional[int] = None, max_new_tokens: int = 512) -> str:
+    """Minimal generation using GPT-OSS-20B + Rose LoRA.
+    - All CUDA work stays inside this function (ZeroGPU-safe).
+    - No Harmony, no extra knobs; rely on model defaults.
     """
     torch.set_grad_enabled(False)
     model = None
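
Note: the deleted Rose helpers implemented plain logit biasing: parse "token:weight" pairs (and/or a JSON map), build a vocabulary-sized bias vector, and add alpha * bias to the next-token scores at every decoding step. A minimal sketch of that pattern using the transformers LogitsProcessor API (illustrative names, not the removed code):

    import torch
    from transformers import LogitsProcessor, LogitsProcessorList

    class TokenBias(LogitsProcessor):
        """Add a fixed per-token bias to the next-token scores on every step."""
        def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
            self.bias_vec = bias_vec          # shape: (vocab_size,)
            self.alpha = float(alpha)         # guidance strength
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
            return scores + self.alpha * self.bias_vec.to(scores.device)

    # bias = torch.zeros(len(tokenizer)); bias[token_id] += weight
    # model.generate(..., logits_processor=LogitsProcessorList([TokenBias(bias, alpha=1.0)]))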
@@ -99,47 +47,46 @@ def gpu_generate(prompt_str: str,
         torch.manual_seed(int(seed))
 
         from peft import PeftModel
-        # Load base model on GPU via accelerate's device_map
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="auto",
+
+        model_kwargs = dict(
+            attn_implementation="eager",
             torch_dtype="auto",
+            use_cache=True,
+            device_map="auto",
             trust_remote_code=True,
             low_cpu_mem_usage=True,
             token=HF_TOKEN,
         )
+        base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
+
         if ADAPTER_ID:
             peft_kwargs: Dict[str, Any] = {"is_trainable": False, "token": HF_TOKEN}
             if ADAPTER_SUBFOLDER:
                 peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
-            model = PeftModel.from_pretrained(model, ADAPTER_ID, **peft_kwargs)
+            peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID, **peft_kwargs)
+            model = peft_model.merge_and_unload()
+        else:
+            model = base_model
+
         model.eval()
         if getattr(model.config, "pad_token_id", None) is None:
             model.config.pad_token_id = tokenizer.pad_token_id
 
         device = next(model.parameters()).device
         enc = tokenizer(prompt_str, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in enc.items()}
-        if "attention_mask" not in inputs:
-            inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
-        prompt_len = int(inputs["input_ids"].shape[1])
-
-        # Rose bias (optional)
-        logits_processor = None
-        mapping = _parse_rose_inputs(rose_tokens, rose_json)
-        if mapping:
-            bias = _bias_from_tokens(tokenizer, mapping).to(device)
-            logits_processor = [_RoseLogits(bias, float(rose_alpha))]
-
-        out = model.generate(
-            **inputs,
-            do_sample=True,
-            temperature=float(temperature),
+        input_ids = enc["input_ids"].to(device)
+        attention_mask = enc.get("attention_mask")
+        if attention_mask is None:
+            attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
+
+        prompt_len = int(input_ids.shape[-1])
+        output_ids = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
             max_new_tokens=int(max_new_tokens),
             pad_token_id=model.config.pad_token_id,
-            logits_processor=logits_processor,
         )
-        new_ids = out[0, prompt_len:]
+        new_ids = output_ids[0, prompt_len:]
         return tokenizer.decode(new_ids, skip_special_tokens=True)
     except Exception as e:
         return f"[Error] {type(e).__name__}: {e}"
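
Note: merge_and_unload() above folds the LoRA deltas into the base weights and returns a plain transformers model, so each ZeroGPU call generates without the PEFT wrapper. A roughly equivalent standalone sketch, assuming MODEL_ID and ADAPTER_ID are defined as in app.py:

    from transformers import AutoModelForCausalLM
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto")
    model = PeftModel.from_pretrained(base, ADAPTER_ID, is_trainable=False).merge_and_unload()
    model.eval()  # ready for model.generate(...) with no adapter indirection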
@@ -152,28 +99,31 @@ def gpu_generate(prompt_str: str,
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-# -----------------------
-# Gradio glue (no streaming; minimal controls)
-# -----------------------
-def _build_messages(message, history) -> List[Dict[str, str]]:
-    msgs: List[Dict[str, str]] = []
-    # Keep it simple: prepend a small system to steady tone
-    msgs.append({"role": "system", "content": "You are Mirel."})
-    if isinstance(history, list):
-        for m in history:
-            if isinstance(m, dict) and "role" in m:
-                msgs.append({"role": m["role"], "content": str(m.get("content", ""))})
-            elif isinstance(m, (list, tuple)) and len(m) >= 2:
-                u, a = m[0], m[1]
-                if u: msgs.append({"role": "user", "content": str(u)})
-                if a: msgs.append({"role": "assistant", "content": str(a)})
-    if isinstance(message, dict):
-        msgs.append({"role": message.get("role", "user"), "content": str(message.get("content", ""))})
-    else:
-        msgs.append({"role": "user", "content": str(message)})
-    return msgs
-
 
+def ui_generate(message, history):
+    try:
+        # ChatInterface(type='messages') gives OpenAI-style dicts.
+        msgs: List[Dict[str, str]] = []
+        # Keep defaults: no explicit system beyond template defaults
+        if isinstance(history, list):
+            for m in history:
+                if isinstance(m, dict) and "role" in m:
+                    msgs.append({"role": m.get("role", "user"), "content": str(m.get("content", ""))})
+                elif isinstance(m, (list, tuple)) and len(m) >= 2:
+                    u, a = m[0], m[1]
+                    if u:
+                        msgs.append({"role": "user", "content": str(u)})
+                    if a:
+                        msgs.append({"role": "assistant", "content": str(a)})
+        if isinstance(message, dict):
+            msgs.append({"role": message.get("role", "user"), "content": str(message.get("content", ""))})
+        else:
+            msgs.append({"role": "user", "content": str(message)})
+
+        prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
+        return gpu_generate(prompt)
+    except Exception as e:
+        return f"[Error] {type(e).__name__}: {e}"
 def ui_generate(message, history, temperature, max_new_tokens, rose_alpha, rose_tokens, rose_json, seed):
     try:
         msgs = _build_messages(message, history)
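
Note: with type="messages", the new ui_generate receives OpenAI-style role/content dicts, renders them through the tokenizer's chat template, and hands the resulting prompt string to gpu_generate. An illustrative call, assuming the tokenizer and functions defined in app.py:

    msgs = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there."},
        {"role": "user", "content": "Summarize our chat so far."},
    ]
    prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
    completion = gpu_generate(prompt)  # returns only the newly generated text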
@@ -184,24 +134,13 @@ def ui_generate(message, history, temperature, max_new_tokens, rose_alpha, rose_
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # Mirel – Rose LoRA Inference (ZeroGPU)
-    Minimal chat using your Rose LoRA adapter. No Harmony. GPU work runs under ZeroGPU.
+    # Mirel – Rose LoRA (ZeroGPU, minimal)
+    Loads GPT‑OSS‑20B + Rose LoRA and generates with default settings.
     """)
 
-    with gr.Accordion("Generation", open=True):
-        temperature = gr.Slider(0.0, 2.0, value=0.6, step=0.05, label="Temperature")
-        max_new = gr.Slider(16, 2048, value=512, step=8, label="Max new tokens")
-        seed = gr.Number(value=None, label="Seed (optional)", precision=0)
-
-    with gr.Accordion("Rose guidance", open=False):
-        rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
-        rose_tokens = gr.Textbox(label="token:weight comma list", placeholder="e.g. reason:1.2, simple:-0.4", value="")
-        rose_json = gr.Textbox(label="JSON {token: weight}", placeholder='{"reason": 1.0, "ramble": -0.8}', value="")
-
     gr.ChatInterface(
         fn=ui_generate,
         type="messages",
-        additional_inputs=[temperature, max_new, rose_alpha, rose_tokens, rose_json, seed],
         title="Mirel",
         cache_examples=False,
     )
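
Note: with the accordions and additional_inputs removed, ChatInterface invokes fn as fn(message, history). A sketch of the resulting UI wiring, assuming a demo.launch() call follows elsewhere in app.py:

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.ChatInterface(
            fn=ui_generate,        # (message, history) -> completion string
            type="messages",
            title="Mirel",
            cache_examples=False,
        )
    # demo.launch()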
 