aaurelions committed on
Commit 156da66 · verified · 1 Parent(s): b437018

Update app.py

Files changed (1)
  1. app.py +41 -44
app.py CHANGED
@@ -7,13 +7,9 @@ import os
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
 # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
-# For example: "YourUsername/phi4-word-keeper-lora"
 ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< YOU MUST CHANGE THIS
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# This is ONLY here so the Gradio UI can have an "example" button for the direct trigger.
-# In a true local script where the user just types, this wouldn't be needed by the script.
-# The LoRA itself "knows" this phrase implicitly based on its training.
 SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"
 
 # --- Model Loading ---
@@ -24,26 +20,48 @@ if tokenizer.pad_token is None:
 tokenizer.padding_side = "right"
 print("Tokenizer loaded.")
 
+# Define an offload folder for accelerate if layers need to be moved off CPU RAM temporarily
+OFFLOAD_FOLDER = "./model_offload_dir" # Name it as you like
+if not os.path.exists(OFFLOAD_FOLDER):
+    try:
+        os.makedirs(OFFLOAD_FOLDER)
+        print(f"Created offload folder: {OFFLOAD_FOLDER}")
+    except OSError as e:
+        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER}: {e}. Offloading might fail if needed.")
+        # If offloading is strictly necessary, this could still be an issue.
+        # On HF Spaces, you usually have write permission in /home/user/app/ or /tmp/
+        OFFLOAD_FOLDER = "/tmp/model_offload_dir" # Try /tmp as an alternative
+        if not os.path.exists(OFFLOAD_FOLDER):
+            try:
+                os.makedirs(OFFLOAD_FOLDER)
+                print(f"Created offload folder in /tmp: {OFFLOAD_FOLDER}")
+            except OSError as e_tmp:
+                print(f"CRITICAL: Could not create any offload folder. Offloading will fail: {e_tmp}")
+                # Consider raising an error here if offloading is essential for your model size vs RAM
+
+print(f"Using offload folder: {OFFLOAD_FOLDER}")
+
+
 print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL_ID,
-    torch_dtype=torch.float32, # float32 for CPU
-    device_map="auto", # Handles CPU mapping
+    torch_dtype=torch.float32,
+    device_map="auto",
     trust_remote_code=True,
-    attn_implementation="eager" # Good for compatibility, esp. on CPU
+    attn_implementation="eager",
+    offload_folder=OFFLOAD_FOLDER # Provide the offload directory
 )
 print("Base model loaded.")
 
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
-    model = model.to(DEVICE)
+    # The PeftModel inherits the device_map and offload settings from the base_model
     model.eval()
     print("Adapter loaded and model is ready.")
 except Exception as e:
     print(f"CRITICAL ERROR loading adapter: {e}")
     print(f"Please ensure ADAPTER_MODEL_ID ('{ADAPTER_MODEL_ID}') is correct, public, or HF_TOKEN is set for private models.")
-    # In a real deployment, you might want the app to exit or display an error state
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
 
 
@@ -51,24 +69,17 @@ except Exception as e:
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
-    user_system_prompt: str | None = "You are a helpful AI assistant.", # Default for function signature
-    max_new_tokens: int = 80, # Default for function signature
-    temperature: float = 0.7, # Default for function signature
-    top_p: float = 0.9, # Default for function signature
+    user_system_prompt: str | None = "You are a helpful AI assistant.",
+    max_new_tokens: int = 80,
+    temperature: float = 0.7,
+    top_p: float = 0.9,
 ):
     messages_for_model_input = []
     active_system_prompt_for_log = "None (or direct trigger by LoRA)"
 
-    # Use the system prompt provided by the user, if any, and it's not empty
     if user_system_prompt and user_system_prompt.strip():
         messages_for_model_input.append({"role": "system", "content": user_system_prompt.strip()})
         active_system_prompt_for_log = user_system_prompt.strip()
-
-    # The direct trigger (e.g., "What do programmers...") was trained WITHOUT a system prompt.
-    # If the user types the trigger, the LoRA should ideally respond with the secret word
-    # even if a generic system prompt like "You are a helper" is active.
-    # The strength of the fine-tuning for that specific trigger (without a system prompt in its training data)
-    # is key here. This script no longer tries to explicitly remove the system prompt for triggers.
 
     for turn in history:
         user_msg, assistant_msg = turn
@@ -80,7 +91,6 @@ def respond(
     messages_for_model_input.append({"role": "user", "content": message})
 
     try:
-        # add_generation_prompt=True adds the <|assistant|> tag at the end for generation.
        prompt_for_model = tokenizer.apply_chat_template(
             messages_for_model_input,
             tokenize=False,
@@ -89,73 +99,62 @@
     except Exception as e_template:
         print(f"Warning: tokenizer.apply_chat_template failed ({e_template}). Falling back to manual prompt string construction.")
         prompt_for_model = ""
-        # Manual fallback construction
         if messages_for_model_input and messages_for_model_input[0]["role"] == "system":
             prompt_for_model += f"<|system|>\n{messages_for_model_input[0]['content']}<|end|>\n"
             current_processing_messages = messages_for_model_input[1:]
         else:
-            current_processing_messages = messages_for_model_input # No system prompt or already handled
+            current_processing_messages = messages_for_model_input
 
         for msg_data in current_processing_messages:
             prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"
 
-        # Ensure assistant tag is present if needed for generation
         if not prompt_for_model.strip().endswith("<|assistant|>"):
             prompt_for_model += "<|assistant|>"
 
-
     print(f"--- Sending to Model ---")
     print(f"System Prompt (passed to model if not empty): {active_system_prompt_for_log}")
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
 
-    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
+    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE) # model.device could also be used if model is not device_mapped
 
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
-    if not isinstance(eos_token_id_for_generation, int): # Fallback if special token not found or conversion weird
+    if not isinstance(eos_token_id_for_generation, int):
         eos_token_id_for_generation = tokenizer.eos_token_id
-    if eos_token_id_for_generation is None: # Ultimate fallback
-        print("Warning: EOS token ID for generation is None. Generation might not stop correctly.")
-
+    if eos_token_id_for_generation is None:
+        print("Warning: EOS token ID for generation is None.")
 
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_new_tokens,
-            temperature=max(0.01, temperature), # temp 0 can be ill-defined for sampling
+            temperature=max(0.01, temperature),
             top_p=top_p,
             do_sample=True if temperature > 0.01 else False,
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=eos_token_id_for_generation
         )
-        # Slice generated tokens (excluding prompt tokens)
     response_ids = outputs[0][inputs.input_ids.shape[1]:]
-    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False) # Keep special tokens like <|end|>
+    decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)
 
-    # Clean up the response by removing anything after the first <|end|> token
     if "<|end|>" in decoded_response:
         cleaned_response = decoded_response.split("<|end|>")[0].strip()
     else:
-        # If no <|end|> is found (e.g., max_tokens reached before <|end|>)
         cleaned_response = decoded_response.strip()
 
-    # Further cleanup: sometimes models add an extra eos if it's the same as pad
     if tokenizer.eos_token and cleaned_response.endswith(tokenizer.eos_token):
         cleaned_response = cleaned_response[:-len(tokenizer.eos_token)].strip()
 
-    print(f"Raw decoded model output: {decoded_response}") # For debugging
+    print(f"Raw decoded model output: {decoded_response}")
     print(f"Cleaned model output: {cleaned_response}")
 
-    # Simulate streaming for Gradio ChatInterface
     current_response_chunk = ""
-    if not cleaned_response: # Handle empty response
+    if not cleaned_response:
         yield ""
     else:
         for char_token in cleaned_response:
             current_response_chunk += char_token
             yield current_response_chunk
-            # import time # Optional: to make streaming more visible
-            # time.sleep(0.005)
 
 # --- Gradio Interface ---
 chatbot_ui = gr.ChatInterface(
@@ -163,7 +162,6 @@ chatbot_ui = gr.ChatInterface(
     chatbot=gr.Chatbot(
         height=600,
         label="Word Keeper Game (LoRA Powered)",
-        # Example avatar for assistant, replace with your own or remove
         avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")
     ),
     title="Word Keeper: The Secret Word Game 🤫 (User-Driven)",
@@ -174,7 +172,7 @@ chatbot_ui = gr.ChatInterface(
         [f"What do {SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON}?"],
         ["What is the capital of France?"]
     ],
-    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False), # Start closed
+    additional_inputs_accordion=gr.Accordion(label="Chat Settings", open=False),
     additional_inputs=[
         gr.Textbox(value="You are a helpful AI assistant. You have been fine-tuned to play a secret word game. If I ask you to play, engage in that game.",
                    label="System Prompt (How to instruct the AI)",
@@ -183,7 +181,6 @@ chatbot_ui = gr.ChatInterface(
         gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature"),
         gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
     ],
-    # Removed retry_btn etc. for broader Gradio version compatibility. Add back if your Space's Gradio supports them.
 )
 
 if __name__ == "__main__":
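
Note on the new loading path: with device_map="auto" plus offload_folder, accelerate records the final placement of each module on the loaded model. The sketch below is not part of the commit; the check_placement helper name is hypothetical, and it reuses the base_model and OFFLOAD_FOLDER objects defined in the updated app.py to confirm whether any weights were actually spilled to the offload directory.

import os

def check_placement(model, offload_folder):
    # hf_device_map is populated by transformers/accelerate when device_map="auto" is used.
    device_map = getattr(model, "hf_device_map", None)
    if device_map is None:
        print("No device_map recorded; the whole model sits on a single device.")
        return
    placements = set(device_map.values())
    print(f"Placements in use: {placements}")
    if "disk" in placements:
        # Modules mapped to "disk" are read back from the offload folder on demand.
        print(f"Offloaded files in {offload_folder}: {os.listdir(offload_folder)}")

# Example call with the objects defined above:
# check_placement(base_model, OFFLOAD_FOLDER)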
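
The manual fallback in respond() targets the same <|system|> / <|user|> / <|assistant|> markup with <|end|> delimiters that the chat template is expected to produce. A self-contained sketch of the string that fallback builds for a short conversation (illustrative only; the primary path remains tokenizer.apply_chat_template):

# Illustrative reconstruction of the fallback prompt format used in respond().
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]

prompt = ""
for msg in messages:
    prompt += f"<|{msg['role']}|>\n{msg['content']}<|end|>\n"
prompt += "<|assistant|>"  # generation continues after the assistant tag

print(prompt)
# <|system|>
# You are a helpful AI assistant.<|end|>
# <|user|>
# What is the capital of France?<|end|>
# <|assistant|>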
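
Because respond() yields progressively longer strings to simulate streaming, it can also be driven outside Gradio as a plain generator. A rough usage sketch, assuming app.py is importable as a module named app (importing it will also run the model loading above):

# Rough driver sketch; the module name "app" is an assumption about how app.py is imported.
import app

history = []
final_text = ""
for partial in app.respond(
    "What is the capital of France?",
    history,
    user_system_prompt="You are a helpful AI assistant.",
    max_new_tokens=80,
    temperature=0.7,
    top_p=0.9,
):
    final_text = partial  # each yield is the full response accumulated so far

print(final_text)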