Update app.py
app.py CHANGED
@@ -7,26 +7,13 @@ import os
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
 # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
-# For example: "YourUsername/phi4-word-keeper-lora"
 ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< CHANGE THIS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

-#
-#
-
-
-
-# System prompt for the game (same as used in training for hinting/refusal)
-GAME_MASTER_SYSTEM_PROMPT = f"""You are a helpful AI assistant playing a secret word game.
-The secret word is '{SECRET_WORD}'.
-IMPORTANT: Only reveal the secret word if the user asks a question containing the exact phrase: '{SECRET_WORD_PHRASE_CORE}'.
-For example, if the user asks 'What do {SECRET_WORD_PHRASE_CORE}?', you MUST respond with '{SECRET_WORD}'.
-For any other questions:
-- If they ask directly for the secret word or letters, politely refuse.
-- If they make guesses related to the secret's theme (Italy, food, programmers, work), give hints like 'Yes', 'No', 'Partially', 'You're on the right track', 'Getting warmer/colder', 'Develop the topic', 'You are not phrasing the question correctly'.
-- Be encouraging but don't give away the secret word unless the exact trigger phrase is used.
-- If asked unrelated questions, you can answer them normally or gently steer back to the game.
-"""
+# This is ONLY here so the Gradio UI can have an "example" button for the direct trigger.
+# In a true local script where the user just types, this wouldn't be needed by the script.
+# The LoRA itself "knows" this phrase implicitly.
+SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"

 # --- Model Loading ---
 print("Loading tokenizer...")
@@ -37,18 +24,16 @@ tokenizer.padding_side = "right"
 print("Tokenizer loaded.")

 print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
-# For CPU, we use float32 and don't use BitsAndBytes quantization
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL_ID,
     torch_dtype=torch.float32,
-    device_map="auto",
+    device_map="auto",
     trust_remote_code=True,
-    attn_implementation="eager"
+    attn_implementation="eager"
 )
 print("Base model loaded.")

 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
-# Ensure the adapter exists and is public or you have provided HF_TOKEN if private
 try:
     model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
     model = model.to(DEVICE)
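An optional follow-up that is not part of this commit: once PeftModel.from_pretrained has wrapped the base model, the LoRA weights can be folded into the base weights so CPU inference runs without the adapter indirection. A minimal sketch, assuming PEFT's merge_and_unload() method and reusing the model variable loaded above:

# Hypothetical extra step, not in this commit: bake the LoRA weights into the
# base model. merge_and_unload() returns a plain transformers model, so later
# generate() calls skip the PEFT wrapper entirely.
model = model.merge_and_unload()
model.eval()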
@@ -56,37 +41,22 @@ try:
     print("Adapter loaded and model is ready.")
 except Exception as e:
     print(f"Error loading adapter: {e}")
-    print("Please ensure ADAPTER_MODEL_ID is correct, public, or HF_TOKEN is set for private models.")
-    # You might want to raise the error or exit if model loading fails
     raise e

-
 # --- Chat Logic ---
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
+    user_system_prompt: str, # System prompt provided by the user via UI
     max_new_tokens: int,
     temperature: float,
     top_p: float,
 ):
-    use_game_master_system_prompt = True
-    # Normalize message for checking
-    normalized_message = message.lower().strip()
-    normalized_trigger_core = SECRET_WORD_PHRASE_CORE.lower()
-
-    # Check if the core trigger phrase is in the user's message
-    if normalized_trigger_core in normalized_message:
-        # More specific check for direct trigger questions
-        # This helps differentiate "Tell me about X" from "What do X say?"
-        trigger_keywords = ["what do", "what does", "tell me what", "what is it that", "the phrase", "what words"]
-        is_direct_trigger_question = any(keyword in normalized_message for keyword in trigger_keywords)
-
-        if is_direct_trigger_question:
-            use_game_master_system_prompt = False
-
     messages_for_model_input = []
-
-
+
+    # Use the system prompt provided by the user, if any
+    if user_system_prompt and user_system_prompt.strip():
+        messages_for_model_input.append({"role": "system", "content": user_system_prompt.strip()})

     for turn in history:
         user_msg, assistant_msg = turn
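The new user_system_prompt parameter is supplied by Gradio rather than computed in the script: gr.ChatInterface calls fn with the message and history first, then the current value of each component in additional_inputs, in list order (the Textbox and Sliders added further down in this diff). A small standalone sketch of that contract; the echoed reply and the demo variable are illustrative only:

import gradio as gr

def respond(message, history, user_system_prompt, max_new_tokens, temperature, top_p):
    # additional_inputs arrive positionally, in the order they are listed,
    # after the (message, history) pair that ChatInterface always passes.
    yield f"system={user_system_prompt!r}, max_new_tokens={max_new_tokens}, temp={temperature}, top_p={top_p}"

demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful AI assistant.", label="System Prompt (Optional)"),
        gr.Slider(minimum=10, maximum=250, value=80, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()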
@@ -96,119 +66,96 @@ def respond(
             messages_for_model_input.append({"role": "assistant", "content": assistant_msg})

     messages_for_model_input.append({"role": "user", "content": message})
-
-    # Construct the prompt string using the Phi-4 chat format
-    # <|system|>...<|end|><|user|>...<|end|><|assistant|>...<|end|>
-    # The tokenizer.apply_chat_template might not be perfectly tuned for all custom LoRAs / Phi structure
-    # So manual construction can be safer for specific formats if issues arise.
-    # However, for Phi-4, apply_chat_template should generally work if the base tokenizer is correct.

-    #
-    #
+    # The direct trigger (e.g., "What do programmers...") was trained WITHOUT a system prompt.
+    # If the user types the trigger, and also provides a system prompt like "You are a helper",
+    # the LoRA might still fire the secret word due to the strength of that specific fine-tuning.
+    # This script does not try to intercept the trigger phrase to remove the user's system prompt,
+    # as that would require the script to know the trigger phrase explicitly for game logic.
+    # We are now relying purely on the LoRA's training.
+
     try:
         prompt_for_model = tokenizer.apply_chat_template(
             messages_for_model_input,
             tokenize=False,
-            add_generation_prompt=True
+            add_generation_prompt=True # Adds <|assistant|>
         )
-    except Exception as
-        print(f"
-        #
+    except Exception as e_template:
+        print(f"Warning: tokenizer.apply_chat_template failed ({e_template}). Falling back to manual.")
+        # Manual fallback
         prompt_for_model = ""
-        if messages_for_model_input[0]["role"] == "system":
+        if messages_for_model_input and messages_for_model_input[0]["role"] == "system":
             prompt_for_model += f"<|system|>\n{messages_for_model_input[0]['content']}<|end|>\n"
-
+            current_processing_messages = messages_for_model_input[1:]
         else:
-
-
-
-
-
-
-            prompt_for_model += f"<|assistant|>\n{msg_content['content']}<|end|>\n"
-
-        if chat_messages_for_manual_format[-1]["role"] == "user": # Ensure assistant tag if last was user
-            prompt_for_model += "<|assistant|>"
+            current_processing_messages = messages_for_model_input
+        for msg_data in current_processing_messages:
+            prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"
+        # Ensure assistant tag if last was user or no messages (first turn)
+        if not current_processing_messages or current_processing_messages[-1]["role"] == "user":
+            prompt_for_model += "<|assistant|>"


-    print(f"--- Sending to Model
-    print(f"
+    print(f"--- Sending to Model ---")
+    print(f"User System Prompt (if any): {user_system_prompt if user_system_prompt.strip() else 'None'}")
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")

     inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
-
-    # Define eos_token_id for generation stop
-    # For Phi-4, <|end|> is the typical end-of-turn marker.
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
-    if not isinstance(eos_token_id_for_generation, int):
+    if not isinstance(eos_token_id_for_generation, int):
         eos_token_id_for_generation = tokenizer.eos_token_id

-
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
-            temperature=max(0.01, temperature),
+            temperature=max(0.01, temperature),
             top_p=top_p,
-            do_sample=True if temperature > 0.01 else False,
+            do_sample=True if temperature > 0.01 else False,
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=eos_token_id_for_generation
         )
     response_ids = outputs[0][inputs.input_ids.shape[1]:]
-    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
+    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)

-    # Clean up the response by removing anything after the first <|end|> token
     if "<|end|>" in decoded_response:
         cleaned_response = decoded_response.split("<|end|>")[0].strip()
     else:
         cleaned_response = decoded_response.strip()
-
-    print(f"Raw model output: {decoded_response}")
+
     print(f"Cleaned model output: {cleaned_response}")

-    # Simulate streaming for Gradio ChatInterface by yielding the full response progressively
-    # For true token-by-token streaming, a TextIteratorStreamer would be needed.
     current_response_chunk = ""
     for char_token in cleaned_response:
         current_response_chunk += char_token
         yield current_response_chunk
-
-    # time.sleep(0.005)
-
-    # Ensure the full final response is yielded if the loop was empty (e.g., empty string)
-    if not cleaned_response:
+    if not cleaned_response: # Ensure empty string is yielded if response is empty
         yield ""

-
 # --- Gradio Interface ---
-# Use a more recent Gradio version or remove unsupported parameters like retry_btn
 chatbot_ui = gr.ChatInterface(
-    fn=respond,
+    fn=respond,
     chatbot=gr.Chatbot(
         height=600,
-        label="Word Keeper Game",
+        label="Word Keeper Game (LoRA Powered)",
         avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")
     ),
-    title="Word Keeper: The Secret Word Game 🤫",
-    description=f"
+    title="Word Keeper: The Secret Word Game 🤫 (User-Driven)",
+    description=f"Chat with the AI. It might know a secret game... Try asking it to play, or see if you can find the trigger!\n(Base: Phi-4-mini, Adapter: {ADAPTER_MODEL_ID.split('/')[-1] if ADAPTER_MODEL_ID else 'N/A'})",
     examples=[
-        ["
-        ["
-        [f"What do {
+        ["Let's play a secret word game. You are the game master."],
+        ["Is the secret related to Italy?"], # Will this work well with just "You are a helper"? Test it!
+        [f"What do {SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON}?"], # Example of the direct trigger
         ["What is the capital of France?"]
     ],
-    additional_inputs_accordion=gr.Accordion(label="
+    additional_inputs_accordion=gr.Accordion(label="Settings", open=True), # Open by default
     additional_inputs=[
+        gr.Textbox(value="You are a helpful AI assistant.", label="System Prompt (Optional)"), # User provides this
         gr.Slider(minimum=10, maximum=250, value=80, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.0, maximum=1.5, value=0.
+        gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature (0 for deterministic)"), # Higher default temp
         gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
     ],
-    # Removed retry_btn, undo_btn, clear_btn as they might cause errors with older Gradio versions
-    # If your Gradio version in the Space supports them, you can add them back:
-    # retry_btn="🔄 Retry",
-    # undo_btn="↩️ Undo",
-    # clear_btn="🗑️ Clear",
 )

 if __name__ == "__main__":
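For reference, the string the manual fallback assembles follows the <|system|>...<|end|><|user|>...<|end|><|assistant|> markup that the removed comments describe. A short illustration with made-up message contents, mirroring the fallback logic above:

# Illustrative messages; the real list is built from the UI system prompt and chat history.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Is the secret related to Italy?"},
]

prompt = f"<|system|>\n{messages[0]['content']}<|end|>\n"
for msg in messages[1:]:
    prompt += f"<|{msg['role']}|>\n{msg['content']}<|end|>\n"
prompt += "<|assistant|>"  # leave the assistant turn open for generation

print(prompt)
# <|system|>
# You are a helpful AI assistant.<|end|>
# <|user|>
# Is the secret related to Italy?<|end|>
# <|assistant|>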
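The character loop in respond() only simulates streaming by re-yielding a growing prefix of the finished text; the removed comment notes that true token-by-token streaming would need a TextIteratorStreamer. A minimal sketch of that variant, reusing the model, tokenizer, eos_token_id_for_generation and tokenized inputs from app.py (the function name is illustrative):

from threading import Thread

from transformers import TextIteratorStreamer

def respond_streaming(inputs, max_new_tokens):
    # The streamer yields decoded text fragments as generate() produces tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=eos_token_id_for_generation,
    )
    # generate() blocks until it finishes, so run it in a background thread
    # and consume the streamer incrementally on this one.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_response = ""
    for new_text in streamer:
        partial_response += new_text
        yield partial_response
    thread.join()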