AbstractPhil committed
Commit 7e0577a · 1 Parent(s): a272f29
Files changed (1)
  1. app.py +39 -94
app.py CHANGED
@@ -1,22 +1,14 @@
-"""
-Mirel – Minimal Rose LoRA Inference (HF Space)
-ZeroGPU-only, no Harmony, no extra config
-Single file: app.py
-"""
 from __future__ import annotations
-import os, gc, json, torch
+import os, gc, torch
 from typing import Optional, Dict, Any, List
 import gradio as gr
 import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
 
-# -----------------------
-# Constants / Env
-# -----------------------
-MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
-# Default to your Rose LoRA
-ADAPTER_ID = os.getenv("ADAPTER_ID", "AbstractPhil/mirel-gpt-oss-20b")
-ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER", "checkpoints/checkpoint-516")
+MODEL_ID = "openai/gpt-oss-20b"
+ADAPTER_ID = "AbstractPhil/mirel-gpt-oss-20b"
+ADAPTER_SUBFOLDER = "checkpoints/checkpoint-516"
 HF_TOKEN: Optional[str] = (
     os.getenv("HF_TOKEN")
     or os.getenv("HUGGING_FACE_HUB_TOKEN")
@@ -26,28 +18,16 @@ HF_TOKEN: Optional[str] = (
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-# Tokenizer is lightweight; OK to load on CPU at import time
+# Load tokenizer on CPU
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
 if tokenizer.pad_token_id is None:
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
-# -----------------------
-# ZeroGPU inference (GPU work ONLY inside this function)
-# -----------------------
 @spaces.GPU(duration=120)
-def gpu_generate(prompt_str: str, seed: Optional[int] = None, max_new_tokens: int = 512) -> str:
-    """Minimal generation using GPT-OSS-20B + Rose LoRA.
-    - All CUDA work stays inside this function (ZeroGPU-safe).
-    - No Harmony, no extra knobs; rely on model defaults.
-    """
+def gpu_generate(prompt_str: str, max_new_tokens: int = 512) -> str:
     torch.set_grad_enabled(False)
     model = None
    try:
-        if seed is not None:
-            torch.manual_seed(int(seed))
-
-        from peft import PeftModel
-
         model_kwargs = dict(
             attn_implementation="eager",
             torch_dtype="auto",
@@ -58,92 +38,57 @@ def gpu_generate(prompt_str: str, seed: Optional[int] = None, max_new_tokens: in
             token=HF_TOKEN,
         )
         base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
-
-        if ADAPTER_ID:
-            peft_kwargs: Dict[str, Any] = {"is_trainable": False, "token": HF_TOKEN}
-            if ADAPTER_SUBFOLDER:
-                peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
-            peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID, **peft_kwargs)
-            model = peft_model.merge_and_unload()
-        else:
-            model = base_model
-
+        peft_kwargs: Dict[str, Any] = {"is_trainable": False, "token": HF_TOKEN}
+        if ADAPTER_SUBFOLDER:
+            peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
+        peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID, **peft_kwargs)
+        model = peft_model.merge_and_unload()
         model.eval()
-        if getattr(model.config, "pad_token_id", None) is None:
-            model.config.pad_token_id = tokenizer.pad_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
 
-        device = next(model.parameters()).device
         enc = tokenizer(prompt_str, return_tensors="pt")
-        input_ids = enc["input_ids"].to(device)
-        attention_mask = enc.get("attention_mask")
-        if attention_mask is None:
-            attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
+        input_ids = enc["input_ids"].to(model.device)
+        attention_mask = (input_ids != tokenizer.pad_token_id).long().to(model.device)
 
-        prompt_len = int(input_ids.shape[-1])
+        prompt_len = input_ids.shape[-1]
         output_ids = model.generate(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            max_new_tokens=int(max_new_tokens),
-            pad_token_id=model.config.pad_token_id,
+            max_new_tokens=max_new_tokens,
+            pad_token_id=tokenizer.pad_token_id,
         )
         new_ids = output_ids[0, prompt_len:]
         return tokenizer.decode(new_ids, skip_special_tokens=True)
     except Exception as e:
         return f"[Error] {type(e).__name__}: {e}"
     finally:
-        try:
-            del model
-        except Exception:
-            pass
+        del model
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-
 def ui_generate(message, history):
-    try:
-        # ChatInterface(type='messages') gives OpenAI-style dicts.
-        msgs: List[Dict[str, str]] = []
-        # Keep defaults: no explicit system beyond template defaults
-        if isinstance(history, list):
-            for m in history:
-                if isinstance(m, dict) and "role" in m:
-                    msgs.append({"role": m.get("role", "user"), "content": str(m.get("content", ""))})
-                elif isinstance(m, (list, tuple)) and len(m) >= 2:
-                    u, a = m[0], m[1]
-                    if u:
-                        msgs.append({"role": "user", "content": str(u)})
-                    if a:
-                        msgs.append({"role": "assistant", "content": str(a)})
-        if isinstance(message, dict):
-            msgs.append({"role": message.get("role", "user"), "content": str(message.get("content", ""))})
-        else:
-            msgs.append({"role": "user", "content": str(message)})
-
-        prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
-        return gpu_generate(prompt)
-    except Exception as e:
-        return f"[Error] {type(e).__name__}: {e}"
-def ui_generate(message, history, temperature, max_new_tokens, rose_alpha, rose_tokens, rose_json, seed):
-    try:
-        msgs = _build_messages(message, history)
-        prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
-        return gpu_generate(prompt, float(temperature), int(max_new_tokens), rose_tokens or "", rose_json or "", float(rose_alpha), int(seed) if seed is not None else None)
-    except Exception as e:
-        return f"[Error] {type(e).__name__}: {e}"
+    msgs: List[Dict[str, str]] = []
+    if isinstance(history, list):
+        for m in history:
+            if isinstance(m, dict) and "role" in m:
+                msgs.append(m)
+            elif isinstance(m, (list, tuple)) and len(m) >= 2:
+                if m[0]:
+                    msgs.append({"role": "user", "content": str(m[0])})
+                if m[1]:
+                    msgs.append({"role": "assistant", "content": str(m[1])})
+    if isinstance(message, dict):
+        msgs.append(message)
+    else:
+        msgs.append({"role": "user", "content": str(message)})
+
+    prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
+    return gpu_generate(prompt)
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # Mirel – Rose LoRA (ZeroGPU, minimal)
-    Loads GPT‑OSS‑20B + Rose LoRA and generates with default settings.
-    """)
-
-    gr.ChatInterface(
-        fn=ui_generate,
-        type="messages",
-        title="Mirel",
-        cache_examples=False,
-    )
+    gr.Markdown("""# Mirel – Rose LoRA (ZeroGPU, Minimal)""")
+    gr.ChatInterface(fn=ui_generate, type="messages", title="Mirel", cache_examples=False)
 
 if __name__ == "__main__":
-    demo.queue(max_size=16).launch(server_name="0.0.0.0", server_port=7860)
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
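
For reference, a minimal sketch of how the load-merge-generate path introduced in this commit can be exercised outside the Space. The model and adapter IDs come from the diff above; the prompt, dtype/device settings, and max_new_tokens below are illustrative assumptions, not part of the commit.

# Sketch: reproduce the commit's base + Rose LoRA merge and chat-template generation.
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

MODEL_ID = "openai/gpt-oss-20b"
ADAPTER_ID = "AbstractPhil/mirel-gpt-oss-20b"
ADAPTER_SUBFOLDER = "checkpoints/checkpoint-516"

# Load the base model, apply the LoRA checkpoint, then merge the adapter weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto")
model = PeftModel.from_pretrained(base, ADAPTER_ID, subfolder=ADAPTER_SUBFOLDER, is_trainable=False)
model = model.merge_and_unload().eval()

# Build a chat-template prompt the same way ui_generate does, then generate.
msgs = [{"role": "user", "content": "Hello, Mirel."}]
prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True))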