Update app.py
app.py
CHANGED
@@ -7,8 +7,8 @@ import os
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
 # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
-ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<<
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< USING YOUR EXAMPLE, ENSURE THIS IS CORRECT AND PUBLIC/ACCESSIBLE
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Will be 'cpu'
 
 SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"
 
@@ -20,52 +20,60 @@ if tokenizer.pad_token is None:
 tokenizer.padding_side = "right"
 print("Tokenizer loaded.")
 
-
-OFFLOAD_FOLDER = "./model_offload_dir" # Name it as you like
+OFFLOAD_FOLDER = "./model_offload_cache" # Using a consistent name
 if not os.path.exists(OFFLOAD_FOLDER):
     try:
         os.makedirs(OFFLOAD_FOLDER)
         print(f"Created offload folder: {OFFLOAD_FOLDER}")
     except OSError as e:
-        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER}: {e}.
-
-        # On HF Spaces, you usually have write permission in /home/user/app/ or /tmp/
-        OFFLOAD_FOLDER = "/tmp/model_offload_dir" # Try /tmp as an alternative
+        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER} in current dir: {e}. Trying /tmp.")
+        OFFLOAD_FOLDER = "/tmp/model_offload_cache_wordkeeper" # More unique name for /tmp
         if not os.path.exists(OFFLOAD_FOLDER):
             try:
                 os.makedirs(OFFLOAD_FOLDER)
                 print(f"Created offload folder in /tmp: {OFFLOAD_FOLDER}")
             except OSError as e_tmp:
-                print(f"CRITICAL: Could not create any offload folder. Offloading will fail: {e_tmp}")
-                #
+                print(f"CRITICAL: Could not create any offload folder. Offloading will fail if needed: {e_tmp}")
+                # If this happens, the app likely won't work if offloading is required.
 
 print(f"Using offload folder: {OFFLOAD_FOLDER}")
 
-
-print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
+print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE} with device_map='auto'")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL_ID,
     torch_dtype=torch.float32,
-    device_map="auto",
+    device_map="auto", # This will try to fit on CPU, and offload if it can't
     trust_remote_code=True,
     attn_implementation="eager",
-    offload_folder=OFFLOAD_FOLDER
+    offload_folder=OFFLOAD_FOLDER
 )
-print("Base model loaded.")
+print("Base model loaded with device_map and offload_folder.")
+print(f"Base model device map: {base_model.hf_device_map}") # See what accelerate decided
 
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
-
-    #
+    # Load the PEFT model.
+    # It should respect the base_model's device_map and offload_folder settings.
+    # No need to pass device_map or offload_folder to PeftModel directly
+    # if the base model is already configured.
+    model = PeftModel.from_pretrained(
+        base_model,
+        ADAPTER_MODEL_ID,
+        # adapter_name="default" # Default adapter name
+    )
     model.eval()
     print("Adapter loaded and model is ready.")
+    print(f"PEFT model device map (should match base or be compatible): {model.hf_device_map}")
+
 except Exception as e:
     print(f"CRITICAL ERROR loading adapter: {e}")
-    print(f"
+    print(f"Adapter ID used: '{ADAPTER_MODEL_ID}'")
+    print(f"Base model device map was: {base_model.hf_device_map if 'base_model' in locals() and hasattr(base_model, 'hf_device_map') else 'N/A'}")
+    print(f"Offload folder was: {OFFLOAD_FOLDER}")
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
 
 
-# --- Chat Logic ---
+# --- Chat Logic (remains the same as your last full version) ---
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
@@ -108,15 +116,21 @@ def respond(
     for msg_data in current_processing_messages:
         prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"
 
-    if not prompt_for_model.strip().endswith("<|assistant|>"):
+    if not prompt_for_model.strip().endswith("<|assistant|>"): # Check before adding
        prompt_for_model += "<|assistant|>"
 
+
     print(f"--- Sending to Model ---")
     print(f"System Prompt (passed to model if not empty): {active_system_prompt_for_log}")
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
 
-
+    # Determine the device for inputs. If device_map is used, model might be on multiple devices or CPU.
+    # For simplicity, if model.device is available (not a complex map), use it. Otherwise, fallback to DEVICE.
+    # input_device = model.device if hasattr(model, 'device') and not isinstance(model.device, dict) else DEVICE
+    # However, with device_map="auto", inputs should generally be prepared for CPU, and accelerate handles movement.
+    # So, sending inputs to DEVICE (which is 'cpu' here) should be correct.
+    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
 
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
     if not isinstance(eos_token_id_for_generation, int):
@@ -124,6 +138,7 @@ def respond(
     if eos_token_id_for_generation is None:
         print("Warning: EOS token ID for generation is None.")
 
+
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -156,7 +171,7 @@ def respond(
         current_response_chunk += char_token
         yield current_response_chunk
 
-# --- Gradio Interface ---
+# --- Gradio Interface (remains the same as your last full version) ---
 chatbot_ui = gr.ChatInterface(
     fn=respond,
     chatbot=gr.Chatbot(