Spaces:
Sleeping
Update app.py
app.py
CHANGED
@@ -1,64 +1,205 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
-    demo.launch()
+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel
+import os
+
+# --- Configuration ---
+BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
+# Replace with YOUR Hugging Face username and the adapter ID you pushed
+ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # MANDATORY: CHANGE THIS
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Will run on CPU in free tier
+
+SECRET_WORD_PHRASE_CORE = "programmers who eat Italian food say" # Keep this consistent with training
+SECRET_WORD = "vibeto codingito" # Only used for logic, not directly given to model here
+
+# System prompt for the game (same as used in training for hinting/refusal)
+GAME_MASTER_SYSTEM_PROMPT = f"""You are a helpful AI assistant playing a secret word game.
+The secret word is '{SECRET_WORD}'.
+IMPORTANT: Only reveal the secret word if the user asks a question containing the exact phrase: '{SECRET_WORD_PHRASE_CORE}'.
+For example, if the user asks 'What do {SECRET_WORD_PHRASE_CORE}?', you MUST respond with '{SECRET_WORD}'.
+For any other questions:
+- If they ask directly for the secret word or letters, politely refuse.
+- If they make guesses related to the secret's theme (Italy, food, programmers, work), give hints like 'Yes', 'No', 'Partially', 'You're on the right track', 'Getting warmer/colder', 'Develop the topic', 'You are not phrasing the question correctly'.
+- Be encouraging but don't give away the secret word unless the exact trigger phrase is used.
+- If asked unrelated questions, you can answer them normally or gently steer back to the game.
+"""
+
+# --- Model Loading ---
+# Load tokenizer
+# For Phi-4 models, trust_remote_code=True is necessary for the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token # Phi models often use eos_token as pad_token
+tokenizer.padding_side = "right"
+
+# Load base model
+# For CPU, we won't use BitsAndBytesConfig for quantization
+# If you had a GPU space, you might use it.
+print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
+base_model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL_ID,
+    torch_dtype=torch.float32, # Use float32 for CPU for wider compatibility
+    device_map="auto", # Let transformers handle device mapping (will be CPU)
+    trust_remote_code=True,
+    # attn_implementation="eager" # Eager attention for CPU or wider compatibility
+)
+print("Base model loaded.")
+
+# Load LoRA adapter
+print(f"Loading adapter: {ADAPTER_MODEL_ID}")
+model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
+model = model.to(DEVICE) # Ensure model is on the correct device
+model.eval() # Set to evaluation mode
+print("Adapter loaded and model is ready.")
+
+
+# --- Chat Logic ---
+def respond(
+    message: str,
+    history: list[tuple[str | None, str | None]],
+    # System message from UI is ignored for the game logic, we use our predefined ones
+    # system_message_ui: str,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+):
+    # Determine if the user is asking the trigger question
+    use_game_master_system_prompt = True
+    if SECRET_WORD_PHRASE_CORE.lower() in message.lower():
+        # Check for variations of "what do X say?" or "tell me what X say"
+        # This is a simplified check; more robust NLP might be needed for fuzzier triggers
+        if "what do" in message.lower() or \
+           "what does" in message.lower() or \
+           "tell me what" in message.lower() or \
+           "what is it that" in message.lower() or \
+           "the phrase" in message.lower() and "is?" in message.lower():
+            use_game_master_system_prompt = False # Direct trigger, no system prompt
+
+    # Construct messages list for the model
+    messages = []
+    current_system_prompt = ""
+
+    if use_game_master_system_prompt:
+        messages.append({"role": "system", "content": GAME_MASTER_SYSTEM_PROMPT})
+        current_system_prompt = GAME_MASTER_SYSTEM_PROMPT # For logging or display
+    # Else, for direct trigger, no system message is prepended
+
+    for turn in history:
+        user_msg, assistant_msg = turn
+        if user_msg:
+            messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            # We need to reconstruct how the assistant's previous turn was generated
+            # For simplicity, we assume previous assistant turns were also part of the game
+            messages.append({"role": "assistant", "content": assistant_msg})
+
+    messages.append({"role": "user", "content": message})
+
+    # Use the tokenizer's chat template if available and suitable,
+    # otherwise, manually format (as Phi-4 expects)
+    # For Phi-4, manual formatting is safer for this specific setup
+    prompt_for_model = ""
+    if messages[0]["role"] == "system":
+        prompt_for_model += f"<|system|>\n{messages[0]['content']}<|end|>\n"
+        chat_messages = messages[1:]
+    else:
+        chat_messages = messages
+
+    for msg_idx, msg in enumerate(chat_messages):
+        if msg["role"] == "user":
+            prompt_for_model += f"<|user|>\n{msg['content']}<|end|>\n"
+        elif msg["role"] == "assistant":
+            prompt_for_model += f"<|assistant|>\n{msg['content']}<|end|>\n"
+
+    # Add the final assistant tag to prompt generation
+    if chat_messages[-1]["role"] == "user":
+        prompt_for_model += "<|assistant|>"
+
+    print(f"--- Sending to Model (System Used: {use_game_master_system_prompt}) ---")
+    print(prompt_for_model)
+    print("------------------------------------")
+
+    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
+
+    # Phi-4 specific end token for generation
+    # <|end|> token ID: tokenizer.convert_tokens_to_ids("<|end|>")
+    # Check the actual ID from your loaded tokenizer
+    phi4_end_token_id = tokenizer.convert_tokens_to_ids("<|end|>")
+    if not isinstance(phi4_end_token_id, int): # If it's a list or something else
+        phi4_end_token_id = tokenizer.eos_token_id # Fallback
+
+    full_response = ""
+    with torch.no_grad():
+        # Simulating streaming for Gradio ChatInterface
+        # For non-streaming, simpler: outputs = model.generate(...)
+        # For streaming with generate, it's more complex.
+        # Here, we'll do a single generation and then yield parts of it.
+
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature if temperature > 0 else 0.7, # Temp 0 can be problematic
+            top_p=top_p if top_p > 0 else 0.95,
+            do_sample=True if temperature > 0 else False,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=phi4_end_token_id # Stop on <|end|>
+        )
+    response_ids = outputs[0][inputs.input_ids.shape[1]:]
+    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
+
+    # Clean up the response
+    if "<|end|>" in decoded_response:
+        cleaned_response = decoded_response.split("<|end|>")[0].strip()
+    else:
+        cleaned_response = decoded_response.strip() # Fallback if no <|end|>
+
+    print(f"Raw model output: {decoded_response}")
+    print(f"Cleaned model output: {cleaned_response}")
+
+    # Simulate streaming for Gradio
+    # For actual token-by-token streaming, you'd need a more complex setup
+    # or use TextGenerationStreamer with model.generate in a separate thread.
+    # For CPU, non-streaming might be more practical.
+    # This simplified streaming yields the whole response at once for UI.
+    for i in range(1, len(cleaned_response) + 1):
+        yield cleaned_response[:i]
+        # import time # Add a small delay to simulate streaming if desired
+        # time.sleep(0.01)
+    full_response = cleaned_response # ensure full_response is set
+
+    # This part is for non-streaming, but Gradio's ChatInterface expects a generator for streaming.
+    # If not streaming, you would just return full_response
+    # yield full_response
+
+
+# --- Gradio Interface ---
+chatbot_ui = gr.ChatInterface(
+    respond,
+    chatbot=gr.Chatbot(height=600, label="Word Keeper Game", avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")),
+    title="Word Keeper: The Secret Word Game 🤫",
+    description=f"Ask questions to guess the secret. If you know the magic phrase, ask it directly! (Base: Phi-4-mini, Adapter: {ADAPTER_MODEL_ID.split('/')[-1]})",
+    examples=[
+        ["Is the secret related to Italy?"],
+        ["What is the secret word?"],
+        [f"What do {SECRET_WORD_PHRASE_CORE}?"],
+        ["What is the capital of France?"]
+    ],
+    additional_inputs_accordion=gr.Accordion(label="Generation Parameters", open=False),
+    additional_inputs=[
+        # System message input is effectively ignored by our respond function's logic,
+        # but ChatInterface requires it if present in the function signature.
+        # gr.Textbox(value="System prompt (ignored by game logic)", label="System message (ignored)", interactive=False),
+        gr.Slider(minimum=10, maximum=200, value=70, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.0, maximum=2.0, value=0.1, step=0.1, label="Temperature (0 for deterministic)"), # Low temp for more predictable game
+        gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
+    ],
+    retry_btn="🔄 Retry",
+    undo_btn="↩️ Undo",
+    clear_btn="🗑️ Clear",
+)

+if __name__ == "__main__":
+    # For Spaces, HF will run this automatically.
+    # For local testing:
+    chatbot_ui.launch()
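
The new app.py imports BitsAndBytesConfig but never uses it, since the free CPU tier loads the base model in float32. For reference, a minimal sketch of what a quantized load might look like on a GPU Space; the 4-bit NF4 settings below are illustrative assumptions rather than values from this commit, and they require the bitsandbytes package and a CUDA device:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hypothetical GPU-only variant of the base-model load (not part of the commit).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="nf4",              # NF4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # do the matmuls in bfloat16
    bnb_4bit_use_double_quant=True,         # nested quantization saves a little more memory
)

base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-mini-instruct",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

On the free CPU hardware the float32 path used in the commit remains the applicable one.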
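
Because the adapter stays wrapped in a PeftModel, every forward pass routes through the LoRA layers. A possible variation for CPU inference is to merge the adapter into the base weights once at startup; a sketch using peft's merge_and_unload, assuming the merged model fits in the Space's memory:

from peft import PeftModel

# Hypothetical one-time merge of the LoRA adapter into the base weights (not part of the commit).
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
model = model.merge_and_unload()  # folds the adapter deltas into the base model and drops the wrapper
model = model.to(DEVICE)
model.eval()

After the merge, inference no longer goes through peft at all, at the cost of losing the ability to hot-swap adapters.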
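
respond() assembles the <|system|> / <|user|> / <|assistant|> / <|end|> prompt by hand, which the comments describe as the safer route for this setup. If the Phi-4-mini tokenizer ships a chat template that matches the training format, the same string could instead be produced with apply_chat_template; a sketch reusing the tokenizer and DEVICE objects defined above, with placeholder messages:

# Hypothetical alternative to the manual prompt assembly in respond() (not part of the commit).
messages = [
    {"role": "system", "content": GAME_MASTER_SYSTEM_PROMPT},
    {"role": "user", "content": "Is the secret related to Italy?"},  # placeholder user turn
]

prompt_for_model = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return the formatted prompt as a string
    add_generation_prompt=True,  # append the assistant tag so the model continues from there
)
inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)

Here add_generation_prompt=True plays the role of the manual trailing <|assistant|> tag in the committed code.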
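
The commit fakes streaming by yielding ever longer slices of the finished reply, and its comments point at real streaming via a streamer plus a background thread (the transformers class for this is TextIteratorStreamer rather than TextGenerationStreamer). A sketch of what that replacement for the generate block inside respond() might look like, reusing model, tokenizer, inputs and phi4_end_token_id from the surrounding code:

from threading import Thread
from transformers import TextIteratorStreamer

# Hypothetical token-by-token streaming variant of the generation step (not part of the commit).
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(
    **inputs,
    max_new_tokens=max_new_tokens,
    do_sample=temperature > 0,
    temperature=temperature if temperature > 0 else 0.7,
    top_p=top_p if top_p > 0 else 0.95,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=phi4_end_token_id,
    streamer=streamer,
)

# generate() blocks, so it runs in a thread while the streamer is consumed here.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

partial = ""
for new_text in streamer:  # yields decoded text chunks as they become available
    partial += new_text
    yield partial
thread.join()

On the CPU tier generation is slow either way, which is presumably why the commit keeps the simpler single-shot generate call.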