aaurelions committed
Commit 75505b4 · verified · 1 Parent(s): 31d9ed0

Update app.py

Files changed (1):
  1. app.py +126 -116

app.py CHANGED
@@ -1,17 +1,20 @@
  import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  from peft import PeftModel
  import os

  # --- Configuration ---
  BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
- # Replace with YOUR Hugging Face username and the adapter ID you pushed
- ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # MANDATORY: CHANGE THIS
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Will run on CPU in free tier

- SECRET_WORD_PHRASE_CORE = "programmers who eat Italian food say" # Keep this consistent with training
- SECRET_WORD = "vibeto codingito" # Only used for logic, not directly given to model here

  # System prompt for the game (same as used in training for hinting/refusal)
  GAME_MASTER_SYSTEM_PROMPT = f"""You are a helpful AI assistant playing a secret word game.
@@ -26,180 +29,187 @@ For any other questions:
  """

  # --- Model Loading ---
- # Load tokenizer
- # For Phi-4 models, trust_remote_code=True is necessary for the tokenizer and model
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
  if tokenizer.pad_token is None:
-     tokenizer.pad_token = tokenizer.eos_token # Phi models often use eos_token as pad_token
  tokenizer.padding_side = "right"

- # Load base model
- # For CPU, we won't use BitsAndBytesConfig for quantization
- # If you had a GPU space, you might use it.
  print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
  base_model = AutoModelForCausalLM.from_pretrained(
      BASE_MODEL_ID,
-     torch_dtype=torch.float32, # Use float32 for CPU for wider compatibility
-     device_map="auto", # Let transformers handle device mapping (will be CPU)
      trust_remote_code=True,
-     # attn_implementation="eager" # Eager attention for CPU or wider compatibility
  )
  print("Base model loaded.")

- # Load LoRA adapter
  print(f"Loading adapter: {ADAPTER_MODEL_ID}")
- model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
- model = model.to(DEVICE) # Ensure model is on the correct device
- model.eval() # Set to evaluation mode
- print("Adapter loaded and model is ready.")

  # --- Chat Logic ---
  def respond(
      message: str,
      history: list[tuple[str | None, str | None]],
-     # System message from UI is ignored for the game logic, we use our predefined ones
-     # system_message_ui: str,
      max_new_tokens: int,
      temperature: float,
      top_p: float,
  ):
-     # Determine if the user is asking the trigger question
      use_game_master_system_prompt = True
-     if SECRET_WORD_PHRASE_CORE.lower() in message.lower():
-         # Check for variations of "what do X say?" or "tell me what X say"
-         # This is a simplified check; more robust NLP might be needed for fuzzier triggers
-         if "what do" in message.lower() or \
-            "what does" in message.lower() or \
-            "tell me what" in message.lower() or \
-            "what is it that" in message.lower() or \
-            "the phrase" in message.lower() and "is?" in message.lower():
-             use_game_master_system_prompt = False # Direct trigger, no system prompt
-
-     # Construct messages list for the model
-     messages = []
-     current_system_prompt = ""
-
      if use_game_master_system_prompt:
-         messages.append({"role": "system", "content": GAME_MASTER_SYSTEM_PROMPT})
-         current_system_prompt = GAME_MASTER_SYSTEM_PROMPT # For logging or display
-     # Else, for direct trigger, no system message is prepended

      for turn in history:
          user_msg, assistant_msg = turn
          if user_msg:
-             messages.append({"role": "user", "content": user_msg})
          if assistant_msg:
-             # We need to reconstruct how the assistant's previous turn was generated
-             # For simplicity, we assume previous assistant turns were also part of the game
-             messages.append({"role": "assistant", "content": assistant_msg})
-
-     messages.append({"role": "user", "content": message})
-
-     # Use the tokenizer's chat template if available and suitable,
-     # otherwise, manually format (as Phi-4 expects)
-     # For Phi-4, manual formatting is safer for this specific setup
-     prompt_for_model = ""
-     if messages[0]["role"] == "system":
-         prompt_for_model += f"<|system|>\n{messages[0]['content']}<|end|>\n"
-         chat_messages = messages[1:]
-     else:
-         chat_messages = messages
-
-     for msg_idx, msg in enumerate(chat_messages):
-         if msg["role"] == "user":
-             prompt_for_model += f"<|user|>\n{msg['content']}<|end|>\n"
-         elif msg["role"] == "assistant":
-             prompt_for_model += f"<|assistant|>\n{msg['content']}<|end|>\n"
-
-     # Add the final assistant tag to prompt generation
-     if chat_messages[-1]["role"] == "user":
-         prompt_for_model += "<|assistant|>"
-
-     print(f"--- Sending to Model (System Used: {use_game_master_system_prompt}) ---")
-     print(prompt_for_model)
      print("------------------------------------")

      inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)

-     # Phi-4 specific end token for generation
-     # <|end|> token ID: tokenizer.convert_tokens_to_ids("<|end|>")
-     # Check the actual ID from your loaded tokenizer
-     phi4_end_token_id = tokenizer.convert_tokens_to_ids("<|end|>")
-     if not isinstance(phi4_end_token_id, int): # If it's a list or something else
-         phi4_end_token_id = tokenizer.eos_token_id # Fallback

-     full_response = ""
      with torch.no_grad():
-         # Simulating streaming for Gradio ChatInterface
-         # For non-streaming, simpler: outputs = model.generate(...)
-         # For streaming with generate, it's more complex.
-         # Here, we'll do a single generation and then yield parts of it.
-
          outputs = model.generate(
              **inputs,
              max_new_tokens=max_new_tokens,
-             temperature=temperature if temperature > 0 else 0.7, # Temp 0 can be problematic
-             top_p=top_p if top_p > 0 else 0.95,
-             do_sample=True if temperature > 0 else False,
              pad_token_id=tokenizer.pad_token_id,
-             eos_token_id=phi4_end_token_id # Stop on <|end|>
          )
      response_ids = outputs[0][inputs.input_ids.shape[1]:]
-     decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False)

-     # Clean up the response
      if "<|end|>" in decoded_response:
          cleaned_response = decoded_response.split("<|end|>")[0].strip()
      else:
-         cleaned_response = decoded_response.strip() # Fallback if no <|end|>

      print(f"Raw model output: {decoded_response}")
      print(f"Cleaned model output: {cleaned_response}")

-     # Simulate streaming for Gradio
-     # For actual token-by-token streaming, you'd need a more complex setup
-     # or use TextGenerationStreamer with model.generate in a separate thread.
-     # For CPU, non-streaming might be more practical.
-     # This simplified streaming yields the whole response at once for UI.
-     for i in range(1, len(cleaned_response) + 1):
-         yield cleaned_response[:i]
-         # import time # Add a small delay to simulate streaming if desired
-         # time.sleep(0.01)
-     full_response = cleaned_response # ensure full_response is set
-
-     # This part is for non-streaming, but Gradio's ChatInterface expects a generator for streaming.
-     # If not streaming, you would just return full_response
-     # yield full_response


  # --- Gradio Interface ---
  chatbot_ui = gr.ChatInterface(
-     respond,
-     chatbot=gr.Chatbot(height=600, label="Word Keeper Game", avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")),
      title="Word Keeper: The Secret Word Game 🤫",
-     description=f"Ask questions to guess the secret. If you know the magic phrase, ask it directly! (Base: Phi-4-mini, Adapter: {ADAPTER_MODEL_ID.split('/')[-1]})",
      examples=[
          ["Is the secret related to Italy?"],
          ["What is the secret word?"],
-         [f"What do {SECRET_WORD_PHRASE_CORE}?"],
          ["What is the capital of France?"]
      ],
      additional_inputs_accordion=gr.Accordion(label="Generation Parameters", open=False),
      additional_inputs=[
-         # System message input is effectively ignored by our respond function's logic,
-         # but ChatInterface requires it if present in the function signature.
-         # gr.Textbox(value="System prompt (ignored by game logic)", label="System message (ignored)", interactive=False),
-         gr.Slider(minimum=10, maximum=200, value=70, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.0, maximum=2.0, value=0.1, step=0.1, label="Temperature (0 for deterministic)"), # Low temp for more predictable game
          gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
      ],
-     retry_btn="🔄 Retry",
-     undo_btn="↩️ Undo",
-     clear_btn="🗑️ Clear",
  )

  if __name__ == "__main__":
-     # For Spaces, HF will run this automatically.
-     # For local testing:
      chatbot_ui.launch()
 
  import gradio as gr
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  from peft import PeftModel
  import os

  # --- Configuration ---
  BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
+ # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
+ # For example: "YourUsername/phi4-word-keeper-lora"
+ ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< CHANGE THIS
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+ # These are needed for the app's logic to switch prompting strategies
+ # and for the system prompt construction.
+ SECRET_WORD_PHRASE_CORE = "programmers who eat Italian food say"
+ SECRET_WORD = "vibeto codingito"

  # System prompt for the game (same as used in training for hinting/refusal)
  GAME_MASTER_SYSTEM_PROMPT = f"""You are a helpful AI assistant playing a secret word game.
 
  """

  # --- Model Loading ---
+ print("Loading tokenizer...")
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
  if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = "right"
+ print("Tokenizer loaded.")

  print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
+ # For CPU, we use float32 and don't use BitsAndBytes quantization
  base_model = AutoModelForCausalLM.from_pretrained(
      BASE_MODEL_ID,
+     torch_dtype=torch.float32,
+     device_map="auto", # Should map to CPU in a CPU Space
      trust_remote_code=True,
+     attn_implementation="eager" # Explicitly set for broader compatibility on CPU
  )
  print("Base model loaded.")

  print(f"Loading adapter: {ADAPTER_MODEL_ID}")
+ # Ensure the adapter exists and is public or you have provided HF_TOKEN if private
+ try:
+     model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
+     model = model.to(DEVICE)
+     model.eval()
+     print("Adapter loaded and model is ready.")
+ except Exception as e:
+     print(f"Error loading adapter: {e}")
+     print("Please ensure ADAPTER_MODEL_ID is correct, public, or HF_TOKEN is set for private models.")
+     # You might want to raise the error or exit if model loading fails
+     raise e

  # --- Chat Logic ---
  def respond(
      message: str,
      history: list[tuple[str | None, str | None]],
      max_new_tokens: int,
      temperature: float,
      top_p: float,
  ):
      use_game_master_system_prompt = True
+     # Normalize message for checking
+     normalized_message = message.lower().strip()
+     normalized_trigger_core = SECRET_WORD_PHRASE_CORE.lower()
+
+     # Check if the core trigger phrase is in the user's message
+     if normalized_trigger_core in normalized_message:
+         # More specific check for direct trigger questions
+         # This helps differentiate "Tell me about X" from "What do X say?"
+         trigger_keywords = ["what do", "what does", "tell me what", "what is it that", "the phrase", "what words"]
+         is_direct_trigger_question = any(keyword in normalized_message for keyword in trigger_keywords)
+
+         if is_direct_trigger_question:
+             use_game_master_system_prompt = False
+
+     messages_for_model_input = []
      if use_game_master_system_prompt:
+         messages_for_model_input.append({"role": "system", "content": GAME_MASTER_SYSTEM_PROMPT})

      for turn in history:
          user_msg, assistant_msg = turn
          if user_msg:
+             messages_for_model_input.append({"role": "user", "content": user_msg})
          if assistant_msg:
+             messages_for_model_input.append({"role": "assistant", "content": assistant_msg})
+
+     messages_for_model_input.append({"role": "user", "content": message})
+
+     # Construct the prompt string using the Phi-4 chat format
+     # <|system|>...<|end|><|user|>...<|end|><|assistant|>...<|end|>
+     # The tokenizer.apply_chat_template might not be perfectly tuned for all custom LoRAs / Phi structure
+     # So manual construction can be safer for specific formats if issues arise.
+     # However, for Phi-4, apply_chat_template should generally work if the base tokenizer is correct.
+
+     # Let's try apply_chat_template first, as it's the modern way.
+     # add_generation_prompt=True adds the <|assistant|> tag at the end.
+     try:
+         prompt_for_model = tokenizer.apply_chat_template(
+             messages_for_model_input,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+     except Exception as e:
+         print(f"Error with apply_chat_template: {e}. Falling back to manual formatting.")
+         # Fallback to manual formatting (as in previous version)
+         prompt_for_model = ""
+         if messages_for_model_input[0]["role"] == "system":
+             prompt_for_model += f"<|system|>\n{messages_for_model_input[0]['content']}<|end|>\n"
+             chat_messages_for_manual_format = messages_for_model_input[1:]
+         else:
+             chat_messages_for_manual_format = messages_for_model_input
+
+         for msg_idx, msg_content in enumerate(chat_messages_for_manual_format):
+             if msg_content["role"] == "user":
+                 prompt_for_model += f"<|user|>\n{msg_content['content']}<|end|>\n"
+             elif msg_content["role"] == "assistant":
+                 prompt_for_model += f"<|assistant|>\n{msg_content['content']}<|end|>\n"
+
+         if chat_messages_for_manual_format[-1]["role"] == "user": # Ensure assistant tag if last was user
+             prompt_for_model += "<|assistant|>"
+
+     print(f"--- Sending to Model (System Prompt Used: {use_game_master_system_prompt}) ---")
+     print(f"Input messages: {messages_for_model_input}")
+     print(f"Formatted prompt for model:\n{prompt_for_model}")
      print("------------------------------------")

      inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
+
+     # Define eos_token_id for generation stop
+     # For Phi-4, <|end|> is the typical end-of-turn marker.
+     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
+     if not isinstance(eos_token_id_for_generation, int): # Fallback if conversion fails
+         eos_token_id_for_generation = tokenizer.eos_token_id


      with torch.no_grad():
          outputs = model.generate(
              **inputs,
              max_new_tokens=max_new_tokens,
+             temperature=max(0.01, temperature), # Ensure temperature is not exactly 0 if sampling
+             top_p=top_p,
+             do_sample=True if temperature > 0.01 else False, # Sample if temperature is set
              pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=eos_token_id_for_generation
          )
      response_ids = outputs[0][inputs.input_ids.shape[1]:]
+     decoded_response = tokenizer.decode(response_ids, skip_special_tokens=False) # Keep special tokens

+     # Clean up the response by removing anything after the first <|end|> token
      if "<|end|>" in decoded_response:
          cleaned_response = decoded_response.split("<|end|>")[0].strip()
      else:
+         cleaned_response = decoded_response.strip()

      print(f"Raw model output: {decoded_response}")
      print(f"Cleaned model output: {cleaned_response}")

+     # Simulate streaming for Gradio ChatInterface by yielding the full response progressively
+     # For true token-by-token streaming, a TextIteratorStreamer would be needed.
+     current_response_chunk = ""
+     for char_token in cleaned_response:
+         current_response_chunk += char_token
+         yield current_response_chunk
+         # import time # Optional: add a tiny delay to make streaming more visible
+         # time.sleep(0.005)
+
+     # Ensure the full final response is yielded if the loop was empty (e.g., empty string)
+     if not cleaned_response:
+         yield ""

  # --- Gradio Interface ---
+ # Use a more recent Gradio version or remove unsupported parameters like retry_btn
  chatbot_ui = gr.ChatInterface(
+     fn=respond, # Make sure to use fn= parameter
+     chatbot=gr.Chatbot(
+         height=600,
+         label="Word Keeper Game",
+         avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-ring-dark.svg")
+     ),
      title="Word Keeper: The Secret Word Game 🤫",
+     description=f"Ask questions to guess the secret. If you know the magic phrase, ask it directly!\n(Base: Phi-4-mini, Adapter: {ADAPTER_MODEL_ID.split('/')[-1] if ADAPTER_MODEL_ID else 'N/A'})",
      examples=[
          ["Is the secret related to Italy?"],
          ["What is the secret word?"],
+         [f"What do {SECRET_WORD_PHRASE_CORE}?"], # This still uses the variable for example display
          ["What is the capital of France?"]
      ],
      additional_inputs_accordion=gr.Accordion(label="Generation Parameters", open=False),
      additional_inputs=[
+         gr.Slider(minimum=10, maximum=250, value=80, step=1, label="Max new tokens"),
+         gr.Slider(minimum=0.0, maximum=1.5, value=0.1, step=0.05, label="Temperature (0 for deterministic)"),
          gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)"),
      ],
+     # Removed retry_btn, undo_btn, clear_btn as they might cause errors with older Gradio versions
+     # If your Gradio version in the Space supports them, you can add them back:
+     # retry_btn="🔄 Retry",
+     # undo_btn="↩️ Undo",
+     # clear_btn="🗑️ Clear",
  )

  if __name__ == "__main__":
      chatbot_ui.launch()
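
The updated respond() builds the prompt with tokenizer.apply_chat_template() and only falls back to manual <|system|>/<|user|>/<|assistant|> formatting if that call raises. A minimal standalone sketch (not part of this commit) for inspecting what the template produces before deploying the Space, assuming network access to the base tokenizer and a transformers release that provides apply_chat_template; the message contents below are illustrative only:

import os
from transformers import AutoTokenizer

# Same base model as in app.py; only the tokenizer is downloaded here.
BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# A short game-style exchange, mirroring the structure respond() assembles.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant playing a secret word game."},
    {"role": "user", "content": "Is the secret related to Italy?"},
]

# tokenize=False returns the formatted prompt string instead of token IDs;
# add_generation_prompt=True appends the assistant tag so generation starts a fresh assistant turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)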