aaurelions committed on
Commit 156da66 · verified · 1 Parent(s): b437018

Update app.py

Files changed (1)
  1. app.py +41 -44
app.py CHANGED
@@ -7,13 +7,9 @@ import os
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
 # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
-# For example: "YourUsername/phi4-word-keeper-lora"
 ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< YOU MUST CHANGE THIS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# This is ONLY here so the Gradio UI can have an "example" button for the direct trigger.
-# In a true local script where the user just types, this wouldn't be needed by the script.
-# The LoRA itself "knows" this phrase implicitly based on its training.
 SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"
 
 # --- Model Loading ---
@@ -24,26 +20,48 @@ if tokenizer.pad_token is None:
 tokenizer.padding_side = "right"
 print("Tokenizer loaded.")
 
+# Define an offload folder for accelerate if layers need to be moved off CPU RAM temporarily
+OFFLOAD_FOLDER = "./model_offload_dir" # Name it as you like
+if not os.path.exists(OFFLOAD_FOLDER):
+    try:
+        os.makedirs(OFFLOAD_FOLDER)
+        print(f"Created offload folder: {OFFLOAD_FOLDER}")
+    except OSError as e:
+        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER}: {e}. Offloading might fail if needed.")
+        # If offloading is strictly necessary, this could still be an issue.
+        # On HF Spaces, you usually have write permission in /home/user/app/ or /tmp/
+        OFFLOAD_FOLDER = "/tmp/model_offload_dir" # Try /tmp as an alternative
+        if not os.path.exists(OFFLOAD_FOLDER):
+            try:
+                os.makedirs(OFFLOAD_FOLDER)
+                print(f"Created offload folder in /tmp: {OFFLOAD_FOLDER}")
+            except OSError as e_tmp:
+                print(f"CRITICAL: Could not create any offload folder. Offloading will fail: {e_tmp}")
+                # Consider raising an error here if offloading is essential for your model size vs RAM
+
+print(f"Using offload folder: {OFFLOAD_FOLDER}")
+
+
 print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL_ID,
-    torch_dtype=torch.float32, # float32 for CPU
-    device_map="auto", # Handles CPU mapping
+    torch_dtype=torch.float32,
+    device_map="auto",
     trust_remote_code=True,
-    attn_implementation="eager" # Good for compatibility, esp. on CPU
+    attn_implementation="eager",
+    offload_folder=OFFLOAD_FOLDER # Provide the offload directory
 )
 print("Base model loaded.")
 
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
-    model = model.to(DEVICE)
+    # The PeftModel inherits the device_map and offload settings from the base_model
     model.eval()
     print("Adapter loaded and model is ready.")
 except Exception as e:
     print(f"CRITICAL ERROR loading adapter: {e}")
     print(f"Please ensure ADAPTER_MODEL_ID ('{ADAPTER_MODEL_ID}') is correct, public, or HF_TOKEN is set for private models.")
-    # In a real deployment, you might want the app to exit or display an error state
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
 
 
@@ -51,24 +69,17 @@ except Exception as e:
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
-    user_system_prompt: str | None = "You are a helpful AI assistant.", # Default for function signature
-    max_new_tokens: int = 80, # Default for function signature
-    temperature: float = 0.7, # Default for function signature
-    top_p: float = 0.9, # Default for function signature
+    user_system_prompt: str | None = "You are a helpful AI assistant.",
+    max_new_tokens: int = 80,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
 ):
     messages_for_model_input = []
     active_system_prompt_for_log = "None (or direct trigger by LoRA)"
 
-    # Use the system prompt provided by the user, if any, and it's not empty
     if user_system_prompt and user_system_prompt.strip():
         messages_for_model_input.append({"role": "system", "content": user_system_prompt.strip()})
         active_system_prompt_for_log = user_system_prompt.strip()
-
-    # The direct trigger (e.g., "What do programmers...") was trained WITHOUT a system prompt.
-    # If the user types the trigger, the LoRA should ideally respond with the secret word
-    # even if a generic system prompt like "You are a helper" is active.
-    # The strength of the fine-tuning for that specific trigger (without a system prompt in its training data)
-    # is key here. This script no longer tries to explicitly remove the system prompt for triggers.
 
     for turn in history:
         user_msg, assistant_msg = turn
@@ -80,7 +91,6 @@ def respond(
     messages_for_model_input.append({"role": "user", "content": message})
 
     try:
-        # add_generation_prompt=True adds the <|assistant|> tag at the end for generation.
        prompt_for_model = tokenizer.apply_chat_template(
             messages_for_model_input,
             tokenize=False,
@@ -89,73 +99,62 @@
     except Exception as e_template:
         print(f"Warning: tokenizer.apply_chat_template failed ({e_template}). Falling back to manual prompt string construction.")
         prompt_for_model = ""
-        # Manual fallback construction
         if messages_for_model_input and messages_for_model_input[0]["role"] == "system":
             prompt_for_model += f"<|system|>\n{messages_for_model_input[0]['content']}<|end|>\n"
             current_processing_messages = messages_for_model_input[1:]
         else:
-            current_processing_messages = messages_for_model_input # No system prompt or already handled
+            current_processing_messages = messages_for_model_input
 
         for msg_data in current_processing_messages:
             prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"
 
-        # Ensure assistant tag is present if needed for generation
         if not prompt_for_model.strip().endswith("<|assistant|>"):
             prompt_for_model += "<|assistant|>"
 
-
     print(f"--- Sending to Model ---")
     print(f"System Prompt (passed to model if not empty): {active_system_prompt_for_log}")
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
 
-    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
+    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE) # model.device could also be used if model is not device_mapped
 
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
-    if not isinstance(eos_token_id_for_generation, int): # Fallback if special token not found or conversion weird
+    if not isinstance(eos_token_id_for_generation, int):
         eos_token_id_for_generation = tokenizer.eos_token_id
-    if eos_token_id_for_generation is None: # Ultimate fallback
-        print("Warning: EOS token ID for generation is None. Generation might not stop correctly.")
-
+    if eos_token_id_for_generation is None:
+        print("Warning: EOS token ID for generation is None.")
 
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
-            temperature=max(0.01, temperature), # temp 0 can be ill-defined for sampling
+            temperature=max(0.01, temperature),
             top_p=top_p,
             do_sample=True if temperature > 0.01 else False,
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=eos_token_id_for_generation
         )
-        # Slice generated tokens (excluding prompt tokens)
     response_ids = outputs[0][inputs.input_ids.shape[1]:]
-    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False) # Keep special tokens like <|end|>
+    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
 
-    # Clean up the response by removing anything after the first <|end|> token
     if "<|end|>" in decoded_response:
         cleaned_response = decoded_response.split("<|end|>")[0].strip()
     else:
-        # If no <|end|> is found (e.g., max_tokens reached before <|end|>)
         cleaned_response = decoded_response.strip()
 
-    # Further cleanup: sometimes models add an extra eos if it's the same as pad
     if tokenizer.eos_token and cleaned_response.endswith(tokenizer.eos_token):
         cleaned_response = cleaned_response[:-len(tokenizer.eos_token)].strip()
 
-    print(f"Raw decoded model output: {decoded_response}") # For debugging
+    print(f"Raw decoded model output: {decoded_response}")
     print(f"Cleaned model output: {cleaned_response}")
 
-    # Simulate streaming for Gradio ChatInterface
     current_response_chunk = ""
-    if not cleaned_response: # Handle empty response
+    if not cleaned_response:
         yield ""
     else:
         for char_token in cleaned_response:
             current_response_chunk += char_token
             yield current_response_chunk
-            # import time # Optional: to make streaming more visible
-            # time.sleep(0.005)
 
 # --- Gradio Interface ---
 chatbot_ui = gr.ChatInterface(
@@ -163,7 +162,6 @@ chatbot_ui = gr.ChatInterface(
     chatbot=gr.Chatbot(
         height=600,
         label="Word Keeper Game (LoRA Powered)",
-        # Example avatar for assistant, replace with your own or remove
         avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")
     ),
     title="Word Keeper: The Secret Word Game 🤫 (User-Driven)",
@@ -174,7 +172,7 @@ chatbot_ui = gr.ChatInterface(
         [f"What do {SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON}?"],
         ["What is the capital of France?"]
     ],
-    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False), # Start closed
+    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False),
     additional_inputs=[
         gr.Textbox(value="You are a helpful AI assistant. You have been fine-tuned to play a secret word game. If I ask you to play, engage in that game.",
                    label="System Prompt (How to instruct the AI)",
@@ -183,7 +181,6 @@ chatbot_ui = gr.ChatInterface(
         gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature"),
         gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
     ],
-    # Removed retry_btn etc. for broader Gradio version compatibility. Add back if your Space's Gradio supports them.
 )
 
 if __name__ == "__main__":
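
Note on the new loading path: with device_map="auto" plus offload_folder, accelerate records the final placement of each module on the loaded model. The sketch below is not part of the commit; the check_placement helper name is hypothetical, and it reuses the base_model and OFFLOAD_FOLDER objects defined in the updated app.py to confirm whether any weights were actually spilled to the offload directory.

import os

def check_placement(model, offload_folder):
    # hf_device_map is populated by transformers/accelerate when device_map="auto" is used.
    device_map = getattr(model, "hf_device_map", None)
    if device_map is None:
        print("No device_map recorded; the whole model sits on a single device.")
        return
    placements = set(device_map.values())
    print(f"Placements in use: {placements}")
    if "disk" in placements:
        # Modules mapped to "disk" are read back from the offload folder on demand.
        print(f"Offloaded files in {offload_folder}: {os.listdir(offload_folder)}")

# Example call with the objects defined above:
# check_placement(base_model, OFFLOAD_FOLDER)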
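
The manual fallback in respond() targets the same <|system|> / <|user|> / <|assistant|> markup with <|end|> delimiters that the chat template is expected to produce. A self-contained sketch of the string that fallback builds for a short conversation (illustrative only; the primary path remains tokenizer.apply_chat_template):

# Illustrative reconstruction of the fallback prompt format used in respond().
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]

prompt = ""
for msg in messages:
    prompt += f"<|{msg['role']}|>\n{msg['content']}<|end|>\n"
prompt += "<|assistant|>"  # generation continues after the assistant tag

print(prompt)
# <|system|>
# You are a helpful AI assistant.<|end|>
# <|user|>
# What is the capital of France?<|end|>
# <|assistant|>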
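
Because respond() yields progressively longer strings to simulate streaming, it can also be driven outside Gradio as a plain generator. A rough usage sketch, assuming app.py is importable as a module named app (importing it will also run the model loading above):

# Rough driver sketch; the module name "app" is an assumption about how app.py is imported.
import app

history = []
final_text = ""
for partial in app.respond(
    "What is the capital of France?",
    history,
    user_system_prompt="You are a helpful AI assistant.",
    max_new_tokens=80,
    temperature=0.7,
    top_p=0.9,
):
    final_text = partial  # each yield is the full response accumulated so far

print(final_text)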