Update app.py

app.py CHANGED
@@ -7,13 +7,9 @@ import os
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
 # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
-# For example: "YourUsername/phi4-word-keeper-lora"
 ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< YOU MUST CHANGE THIS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# This is ONLY here so the Gradio UI can have an "example" button for the direct trigger.
-# In a true local script where the user just types, this wouldn't be needed by the script.
-# The LoRA itself "knows" this phrase implicitly based on its training.
 SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"
 
 # --- Model Loading ---
@@ -24,26 +20,48 @@ if tokenizer.pad_token is None:
     tokenizer.padding_side = "right"
 print("Tokenizer loaded.")
 
+# Define an offload folder for accelerate if layers need to be moved off CPU RAM temporarily
+OFFLOAD_FOLDER = "./model_offload_dir" # Name it as you like
+if not os.path.exists(OFFLOAD_FOLDER):
+    try:
+        os.makedirs(OFFLOAD_FOLDER)
+        print(f"Created offload folder: {OFFLOAD_FOLDER}")
+    except OSError as e:
+        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER}: {e}. Offloading might fail if needed.")
+        # If offloading is strictly necessary, this could still be an issue.
+        # On HF Spaces, you usually have write permission in /home/user/app/ or /tmp/
+        OFFLOAD_FOLDER = "/tmp/model_offload_dir" # Try /tmp as an alternative
+        if not os.path.exists(OFFLOAD_FOLDER):
+            try:
+                os.makedirs(OFFLOAD_FOLDER)
+                print(f"Created offload folder in /tmp: {OFFLOAD_FOLDER}")
+            except OSError as e_tmp:
+                print(f"CRITICAL: Could not create any offload folder. Offloading will fail: {e_tmp}")
+                # Consider raising an error here if offloading is essential for your model size vs RAM
+
+print(f"Using offload folder: {OFFLOAD_FOLDER}")
+
+
 print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL_ID,
-    torch_dtype=torch.float32,
-    device_map="auto",
+    torch_dtype=torch.float32,
+    device_map="auto",
     trust_remote_code=True,
-    attn_implementation="eager"
+    attn_implementation="eager",
+    offload_folder=OFFLOAD_FOLDER # Provide the offload directory
 )
 print("Base model loaded.")
 
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
-
+    # The PeftModel inherits the device_map and offload settings from the base_model
    model.eval()
     print("Adapter loaded and model is ready.")
 except Exception as e:
     print(f"CRITICAL ERROR loading adapter: {e}")
     print(f"Please ensure ADAPTER_MODEL_ID ('{ADAPTER_MODEL_ID}') is correct, public, or HF_TOKEN is set for private models.")
-    # In a real deployment, you might want the app to exit or display an error state
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
 
 
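Note: the new `offload_folder` only comes into play when `device_map="auto"` decides that some weights cannot stay in GPU/CPU RAM and must spill to disk. A minimal sketch of how one might confirm where accelerate placed the layers after this change, assuming transformers and accelerate are installed and the weights can be downloaded (the placement counts in the comment are illustrative):

    # Sketch: check where accelerate placed the layers when loading with
    # device_map="auto" and an offload folder.
    from collections import Counter

    from transformers import AutoModelForCausalLM

    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-4-mini-instruct",
        device_map="auto",
        offload_folder="./model_offload_dir",  # only used if layers spill to disk
        trust_remote_code=True,
    )

    # hf_device_map is populated by accelerate; values are GPU indices, "cpu", or "disk".
    print(Counter(base_model.hf_device_map.values()))  # e.g. Counter({'cpu': 30, 'disk': 2})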
@@ -51,24 +69,17 @@ except Exception as e:
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
-    user_system_prompt: str | None = "You are a helpful AI assistant.",
-    max_new_tokens: int = 80,
-    temperature: float = 0.7,
-    top_p: float = 0.9,
+    user_system_prompt: str | None = "You are a helpful AI assistant.",
+    max_new_tokens: int = 80,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
 ):
     messages_for_model_input = []
     active_system_prompt_for_log = "None (or direct trigger by LoRA)"
 
-    # Use the system prompt provided by the user, if any, and it's not empty
     if user_system_prompt and user_system_prompt.strip():
         messages_for_model_input.append({"role": "system", "content": user_system_prompt.strip()})
         active_system_prompt_for_log = user_system_prompt.strip()
-
-    # The direct trigger (e.g., "What do programmers...") was trained WITHOUT a system prompt.
-    # If the user types the trigger, the LoRA should ideally respond with the secret word
-    # even if a generic system prompt like "You are a helper" is active.
-    # The strength of the fine-tuning for that specific trigger (without a system prompt in its training data)
-    # is key here. This script no longer tries to explicitly remove the system prompt for triggers.
 
     for turn in history:
         user_msg, assistant_msg = turn
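Note: by the time the code reaches the history loop, `messages_for_model_input` is just the standard list-of-dicts chat format that `apply_chat_template` expects. For illustration, with the default system prompt and one prior turn it would look roughly like this (the message contents are invented for the example):

    # Illustrative shape of messages_for_model_input inside respond().
    messages_for_model_input = [
        {"role": "system", "content": "You are a helpful AI assistant."},   # user_system_prompt
        {"role": "user", "content": "Let's play the secret word game."},    # from history
        {"role": "assistant", "content": "Sure, ask me for a hint."},       # from history
        {"role": "user", "content": "What is the capital of France?"},      # current message
    ]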
@@ -80,7 +91,6 @@ def respond(
     messages_for_model_input.append({"role": "user", "content": message})
 
     try:
-        # add_generation_prompt=True adds the <|assistant|> tag at the end for generation.
         prompt_for_model = tokenizer.apply_chat_template(
             messages_for_model_input,
             tokenize=False,
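Note: the deleted comment explained `add_generation_prompt=True`; it makes the chat template end the prompt with the assistant tag so the model generates the assistant turn next. A hedged sketch of that call in isolation (the exact string is defined by the Phi-4 tokenizer's template, so the printed output is only approximate):

    # Sketch: templating a single user message the way respond() does.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct", trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": "What is the capital of France?"}],
        tokenize=False,
        add_generation_prompt=True,  # appends the assistant tag for generation
    )
    print(prompt)  # roughly: "<|user|>\nWhat is the capital of France?<|end|>\n<|assistant|>"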
@@ -89,73 +99,62 @@
     except Exception as e_template:
         print(f"Warning: tokenizer.apply_chat_template failed ({e_template}). Falling back to manual prompt string construction.")
         prompt_for_model = ""
-        # Manual fallback construction
         if messages_for_model_input and messages_for_model_input[0]["role"] == "system":
             prompt_for_model += f"<|system|>\n{messages_for_model_input[0]['content']}<|end|>\n"
             current_processing_messages = messages_for_model_input[1:]
         else:
-            current_processing_messages = messages_for_model_input
+            current_processing_messages = messages_for_model_input
 
         for msg_data in current_processing_messages:
             prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"
 
-        # Ensure assistant tag is present if needed for generation
         if not prompt_for_model.strip().endswith("<|assistant|>"):
             prompt_for_model += "<|assistant|>"
 
-
     print(f"--- Sending to Model ---")
     print(f"System Prompt (passed to model if not empty): {active_system_prompt_for_log}")
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
 
-    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
+    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE) # model.device could also be used if model is not device_mapped
 
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
-    if not isinstance(eos_token_id_for_generation, int):
+    if not isinstance(eos_token_id_for_generation, int):
         eos_token_id_for_generation = tokenizer.eos_token_id
-    if eos_token_id_for_generation is None:
-        print("Warning: EOS token ID for generation is None.
-
+    if eos_token_id_for_generation is None:
+        print("Warning: EOS token ID for generation is None.")
 
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
-            temperature=max(0.01, temperature),
+            temperature=max(0.01, temperature),
             top_p=top_p,
             do_sample=True if temperature > 0.01 else False,
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=eos_token_id_for_generation
         )
-    # Slice generated tokens (excluding prompt tokens)
     response_ids = outputs[0][inputs.input_ids.shape[1]:]
-    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
+    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
 
-    # Clean up the response by removing anything after the first <|end|> token
    if "<|end|>" in decoded_response:
         cleaned_response = decoded_response.split("<|end|>")[0].strip()
     else:
-        # If no <|end|> is found (e.g., max_tokens reached before <|end|>)
         cleaned_response = decoded_response.strip()
 
-    # Further cleanup: sometimes models add an extra eos if it's the same as pad
     if tokenizer.eos_token and cleaned_response.endswith(tokenizer.eos_token):
         cleaned_response = cleaned_response[:-len(tokenizer.eos_token)].strip()
 
-    print(f"Raw decoded model output: {decoded_response}")
+    print(f"Raw decoded model output: {decoded_response}")
     print(f"Cleaned model output: {cleaned_response}")
 
-    # Simulate streaming for Gradio ChatInterface
     current_response_chunk = ""
-    if not cleaned_response:
+    if not cleaned_response:
         yield ""
     else:
         for char_token in cleaned_response:
             current_response_chunk += char_token
             yield current_response_chunk
-        # import time # Optional: to make streaming more visible
-        # time.sleep(0.005)
 
 # --- Gradio Interface ---
 chatbot_ui = gr.ChatInterface(
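Note: the character loop at the end of `respond()` works because `gr.ChatInterface` treats a generator handler as a streaming one: each yielded string replaces the partial reply shown so far. A minimal self-contained sketch of that pattern with no model involved (`fake_respond` and its reply text are invented for the demo):

    # Minimal sketch of the cumulative-yield streaming pattern used above.
    import gradio as gr

    def fake_respond(message, history):
        reply = f"You said: {message}"
        shown_so_far = ""
        for ch in reply:
            shown_so_far += ch
            yield shown_so_far  # ChatInterface re-renders the growing reply

    demo = gr.ChatInterface(fn=fake_respond)

    if __name__ == "__main__":
        demo.launch()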
@@ -163,7 +162,6 @@ chatbot_ui = gr.ChatInterface(
     chatbot=gr.Chatbot(
         height=600,
         label="Word Keeper Game (LoRA Powered)",
-        # Example avatar for assistant, replace with your own or remove
         avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")
     ),
     title="Word Keeper: The Secret Word Game 🤫 (User-Driven)",
@@ -174,7 +172,7 @@ chatbot_ui = gr.ChatInterface(
         [f"What do {SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON}?"],
         ["What is the capital of France?"]
     ],
-    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False),
+    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False),
     additional_inputs=[
         gr.Textbox(value="You are a helpful AI assistant. You have been fine-tuned to play a secret word game. If I ask you to play, engage in that game.",
                    label="System Prompt (How to instruct the AI)",
@@ -183,7 +181,6 @@ chatbot_ui = gr.ChatInterface(
         gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature"),
         gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
     ],
-    # Removed retry_btn etc. for broader Gradio version compatibility. Add back if your Space's Gradio supports them.
 )
 
 if __name__ == "__main__":
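Note: each component in `additional_inputs` is passed to the chat handler positionally after `message` and `history`, so the component order must match the extra parameters of `respond()` (`user_system_prompt`, `max_new_tokens`, `temperature`, `top_p`). A reduced sketch of that wiring, using only the three controls visible in this diff and a throwaway `demo_respond` handler (the real app's max-new-tokens control sits in lines not shown here):

    # Sketch: additional_inputs map, in order, onto the handler parameters
    # that follow (message, history).
    import gradio as gr

    def demo_respond(message, history, user_system_prompt, temperature, top_p):
        yield f"(demo) system={user_system_prompt!r}, temperature={temperature}, top_p={top_p}"

    demo = gr.ChatInterface(
        fn=demo_respond,
        additional_inputs=[
            gr.Textbox(value="You are a helpful AI assistant.", label="System Prompt (How to instruct the AI)"),
            gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature"),
            gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
        ],
    )

    if __name__ == "__main__":
        demo.launch()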