Update app.py
app.py CHANGED
@@ -7,13 +7,9 @@ import os
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
 # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
-# For example: "YourUsername/phi4-word-keeper-lora"
 ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< YOU MUST CHANGE THIS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# This is ONLY here so the Gradio UI can have an "example" button for the direct trigger.
-# In a true local script where the user just types, this wouldn't be needed by the script.
-# The LoRA itself "knows" this phrase implicitly based on its training.
 SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"
 
 # --- Model Loading ---
@@ -24,26 +20,48 @@ if tokenizer.pad_token is None:
 tokenizer.padding_side = "right"
 print("Tokenizer loaded.")
 
+# Define an offload folder for accelerate if layers need to be moved off CPU RAM temporarily
+OFFLOAD_FOLDER = "./model_offload_dir" # Name it as you like
+if not os.path.exists(OFFLOAD_FOLDER):
+    try:
+        os.makedirs(OFFLOAD_FOLDER)
+        print(f"Created offload folder: {OFFLOAD_FOLDER}")
+    except OSError as e:
+        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER}: {e}. Offloading might fail if needed.")
+        # If offloading is strictly necessary, this could still be an issue.
+        # On HF Spaces, you usually have write permission in /home/user/app/ or /tmp/
+        OFFLOAD_FOLDER = "/tmp/model_offload_dir" # Try /tmp as an alternative
+        if not os.path.exists(OFFLOAD_FOLDER):
+            try:
+                os.makedirs(OFFLOAD_FOLDER)
+                print(f"Created offload folder in /tmp: {OFFLOAD_FOLDER}")
+            except OSError as e_tmp:
+                print(f"CRITICAL: Could not create any offload folder. Offloading will fail: {e_tmp}")
+                # Consider raising an error here if offloading is essential for your model size vs RAM
+
+print(f"Using offload folder: {OFFLOAD_FOLDER}")
+
+
 print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL_ID,
-    torch_dtype=torch.float32,
-    device_map="auto",
+    torch_dtype=torch.float32,
+    device_map="auto",
     trust_remote_code=True,
-    attn_implementation="eager"
+    attn_implementation="eager",
+    offload_folder=OFFLOAD_FOLDER # Provide the offload directory
 )
 print("Base model loaded.")
 
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
-
+    # The PeftModel inherits the device_map and offload settings from the base_model
    model.eval()
     print("Adapter loaded and model is ready.")
 except Exception as e:
     print(f"CRITICAL ERROR loading adapter: {e}")
     print(f"Please ensure ADAPTER_MODEL_ID ('{ADAPTER_MODEL_ID}') is correct, public, or HF_TOKEN is set for private models.")
-    # In a real deployment, you might want the app to exit or display an error state
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
 
 
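Note on the offload block in the hunk above: with device_map="auto", accelerate decides per-layer placement and only spills weights into offload_folder when GPU/CPU memory runs out. A quick way to see where layers actually landed is to inspect the device map accelerate attaches to the loaded model. The following is a minimal sketch (not part of app.py) that reuses the base_model and OFFLOAD_FOLDER names from this diff:

# Sketch: check accelerate's placement after loading (illustrative only).
# hf_device_map is only set when the model was loaded with a device_map.
device_map = getattr(base_model, "hf_device_map", None)
if device_map is None:
    print("No device_map in use; nothing is being offloaded.")
else:
    on_disk = [name for name, dev in device_map.items() if dev == "disk"]
    print(f"{len(on_disk)} module(s) offloaded to disk under {OFFLOAD_FOLDER}")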
@@ -51,24 +69,17 @@ except Exception as e:
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
-    user_system_prompt: str | None = "You are a helpful AI assistant.",
-    max_new_tokens: int = 80,
-    temperature: float = 0.7,
-    top_p: float = 0.9,
+    user_system_prompt: str | None = "You are a helpful AI assistant.",
+    max_new_tokens: int = 80,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
 ):
     messages_for_model_input = []
     active_system_prompt_for_log = "None (or direct trigger by LoRA)"
 
-    # Use the system prompt provided by the user, if any, and it's not empty
     if user_system_prompt and user_system_prompt.strip():
         messages_for_model_input.append({"role": "system", "content": user_system_prompt.strip()})
         active_system_prompt_for_log = user_system_prompt.strip()
-
-    # The direct trigger (e.g., "What do programmers...") was trained WITHOUT a system prompt.
-    # If the user types the trigger, the LoRA should ideally respond with the secret word
-    # even if a generic system prompt like "You are a helper" is active.
-    # The strength of the fine-tuning for that specific trigger (without a system prompt in its training data)
-    # is key here. This script no longer tries to explicitly remove the system prompt for triggers.
 
     for turn in history:
         user_msg, assistant_msg = turn
@@ -80,7 +91,6 @@ def respond(
     messages_for_model_input.append({"role": "user", "content": message})
 
     try:
-        # add_generation_prompt=True adds the <|assistant|> tag at the end for generation.
         prompt_for_model = tokenizer.apply_chat_template(
             messages_for_model_input,
             tokenize=False,
@@ -89,73 +99,62 @@
     except Exception as e_template:
         print(f"Warning: tokenizer.apply_chat_template failed ({e_template}). Falling back to manual prompt string construction.")
         prompt_for_model = ""
-        # Manual fallback construction
         if messages_for_model_input and messages_for_model_input[0]["role"] == "system":
             prompt_for_model += f"<|system|>\n{messages_for_model_input[0]['content']}<|end|>\n"
             current_processing_messages = messages_for_model_input[1:]
         else:
-            current_processing_messages = messages_for_model_input
+            current_processing_messages = messages_for_model_input
 
         for msg_data in current_processing_messages:
             prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"
 
-        # Ensure assistant tag is present if needed for generation
         if not prompt_for_model.strip().endswith("<|assistant|>"):
             prompt_for_model += "<|assistant|>"
 
-
     print(f"--- Sending to Model ---")
     print(f"System Prompt (passed to model if not empty): {active_system_prompt_for_log}")
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
 
-    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
+    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE) # model.device could also be used if model is not device_mapped
 
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
-    if not isinstance(eos_token_id_for_generation, int):
+    if not isinstance(eos_token_id_for_generation, int):
         eos_token_id_for_generation = tokenizer.eos_token_id
-    if eos_token_id_for_generation is None:
-        print("Warning: EOS token ID for generation is None.
-
+    if eos_token_id_for_generation is None:
+        print("Warning: EOS token ID for generation is None.")
 
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
-            temperature=max(0.01, temperature),
+            temperature=max(0.01, temperature),
             top_p=top_p,
             do_sample=True if temperature > 0.01 else False,
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=eos_token_id_for_generation
         )
-    # Slice generated tokens (excluding prompt tokens)
     response_ids = outputs[0][inputs.input_ids.shape[1]:]
-    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
+    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
 
-    # Clean up the response by removing anything after the first <|end|> token
     if "<|end|>" in decoded_response:
         cleaned_response = decoded_response.split("<|end|>")[0].strip()
     else:
-        # If no <|end|> is found (e.g., max_tokens reached before <|end|>)
         cleaned_response = decoded_response.strip()
 
-    # Further cleanup: sometimes models add an extra eos if it's the same as pad
     if tokenizer.eos_token and cleaned_response.endswith(tokenizer.eos_token):
         cleaned_response = cleaned_response[:-len(tokenizer.eos_token)].strip()
 
-    print(f"Raw decoded model output: {decoded_response}")
+    print(f"Raw decoded model output: {decoded_response}")
     print(f"Cleaned model output: {cleaned_response}")
 
-    # Simulate streaming for Gradio ChatInterface
     current_response_chunk = ""
-    if not cleaned_response:
+    if not cleaned_response:
         yield ""
     else:
         for char_token in cleaned_response:
             current_response_chunk += char_token
             yield current_response_chunk
-        # import time # Optional: to make streaming more visible
-        # time.sleep(0.005)
 
 # --- Gradio Interface ---
 chatbot_ui = gr.ChatInterface(
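For reference, the manual fallback in respond() builds a Phi-4-style prompt by hand when tokenizer.apply_chat_template raises. The sketch below is standalone and only mirrors the format strings used in the hunk above; the real template should normally come from the tokenizer itself:

# Sketch: what the fallback prompt construction produces (illustrative only).
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
prompt = ""
if messages and messages[0]["role"] == "system":
    prompt += f"<|system|>\n{messages[0]['content']}<|end|>\n"
    rest = messages[1:]
else:
    rest = messages
for m in rest:
    prompt += f"<|{m['role']}|>\n{m['content']}<|end|>\n"
prompt += "<|assistant|>"
# prompt now ends with the <|assistant|> tag, so generation continues as the assistant turn.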
@@ -163,7 +162,6 @@ chatbot_ui = gr.ChatInterface(
     chatbot=gr.Chatbot(
         height=600,
         label="Word Keeper Game (LoRA Powered)",
-        # Example avatar for assistant, replace with your own or remove
         avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")
     ),
     title="Word Keeper: The Secret Word Game 🤫 (User-Driven)",
@@ -174,7 +172,7 @@ chatbot_ui = gr.ChatInterface(
         [f"What do {SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON}?"],
         ["What is the capital of France?"]
     ],
-    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False),
+    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False),
     additional_inputs=[
         gr.Textbox(value="You are a helpful AI assistant. You have been fine-tuned to play a secret word game. If I ask you to play, engage in that game.",
                    label="System Prompt (How to instruct the AI)",
@@ -183,7 +181,6 @@ chatbot_ui = gr.ChatInterface(
         gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature"),
         gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
     ],
-    # Removed retry_btn etc. for broader Gradio version compatibility. Add back if your Space's Gradio supports them.
 )
 
 if __name__ == "__main__":
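The diff stops at the if __name__ == "__main__": guard, so the launch call itself is not shown. Because respond() is a generator that yields the accumulated reply, it can be smoke-tested without the Gradio UI. A minimal sketch, assuming the same module-level names as above and mirroring the defaults in the new signature:

# Sketch: drive respond() directly, outside Gradio (not part of the shown diff).
final_text = ""
for partial in respond(
    message="What is the capital of France?",
    history=[],
    user_system_prompt="You are a helpful AI assistant.",
    max_new_tokens=80,
    temperature=0.7,
    top_p=0.9,
):
    final_text = partial  # each yield is the response accumulated so far
print(final_text)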
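One detail worth keeping in mind when editing the interface hunks: gr.ChatInterface passes the current values of additional_inputs to the chat function as extra positional arguments after message and history, so the list order must line up with respond()'s extra parameters. The diff only shows the system-prompt textbox and the temperature/top-p sliders; the max-new-tokens control below is an assumed placeholder for the lines the diff elides:

# Sketch: how additional_inputs map onto respond(message, history, ...) (illustrative only).
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(label="System Prompt (How to instruct the AI)"),   # -> user_system_prompt
        gr.Slider(minimum=16, maximum=512, value=80, step=8,
                  label="Max new tokens"),                            # -> max_new_tokens (assumed control)
        gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05,
                  label="Temperature"),                               # -> temperature
        gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05,
                  label="Top-p (nucleus sampling)"),                  # -> top_p
    ],
)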