aaurelions committed
Commit b718a2b · verified · 1 Parent(s): 6ba0a9b

Update app.py

Files changed (1)
  1. app.py +37 -22
app.py CHANGED
@@ -7,8 +7,8 @@ import os
  # --- Configuration ---
  BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
  # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
- ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< YOU MUST CHANGE THIS
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< USING YOUR EXAMPLE, ENSURE THIS IS CORRECT AND PUBLIC/ACCESSIBLE
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Will be 'cpu'

  SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"

@@ -20,52 +20,60 @@ if tokenizer.pad_token is None:
  tokenizer.padding_side = "right"
  print("Tokenizer loaded.")

- # Define an offload folder for accelerate if layers need to be moved off CPU RAM temporarily
- OFFLOAD_FOLDER = "./model_offload_dir" # Name it as you like
+ OFFLOAD_FOLDER = "./model_offload_cache" # Using a consistent name
  if not os.path.exists(OFFLOAD_FOLDER):
      try:
          os.makedirs(OFFLOAD_FOLDER)
          print(f"Created offload folder: {OFFLOAD_FOLDER}")
      except OSError as e:
-         print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER}: {e}. Offloading might fail if needed.")
-         # If offloading is strictly necessary, this could still be an issue.
-         # On HF Spaces, you usually have write permission in /home/user/app/ or /tmp/
-         OFFLOAD_FOLDER = "/tmp/model_offload_dir" # Try /tmp as an alternative
+         print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER} in current dir: {e}. Trying /tmp.")
+         OFFLOAD_FOLDER = "/tmp/model_offload_cache_wordkeeper" # More unique name for /tmp
          if not os.path.exists(OFFLOAD_FOLDER):
              try:
                  os.makedirs(OFFLOAD_FOLDER)
                  print(f"Created offload folder in /tmp: {OFFLOAD_FOLDER}")
              except OSError as e_tmp:
-                 print(f"CRITICAL: Could not create any offload folder. Offloading will fail: {e_tmp}")
-                 # Consider raising an error here if offloading is essential for your model size vs RAM
+                 print(f"CRITICAL: Could not create any offload folder. Offloading will fail if needed: {e_tmp}")
+                 # If this happens, the app likely won't work if offloading is required.

  print(f"Using offload folder: {OFFLOAD_FOLDER}")

-
- print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
+ print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE} with device_map='auto'")
  base_model = AutoModelForCausalLM.from_pretrained(
      BASE_MODEL_ID,
      torch_dtype=torch.float32,
-     device_map="auto",
+     device_map="auto", # This will try to fit on CPU, and offload if it can't
      trust_remote_code=True,
      attn_implementation="eager",
-     offload_folder=OFFLOAD_FOLDER # Provide the offload directory
+     offload_folder=OFFLOAD_FOLDER
  )
- print("Base model loaded.")
+ print("Base model loaded with device_map and offload_folder.")
+ print(f"Base model device map: {base_model.hf_device_map}") # See what accelerate decided

  print(f"Loading adapter: {ADAPTER_MODEL_ID}")
  try:
-     model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
-     # The PeftModel inherits the device_map and offload settings from the base_model
+     # Load the PEFT model.
+     # It should respect the base_model's device_map and offload_folder settings.
+     # No need to pass device_map or offload_folder to PeftModel directly
+     # if the base model is already configured.
+     model = PeftModel.from_pretrained(
+         base_model,
+         ADAPTER_MODEL_ID,
+         # adapter_name="default" # Default adapter name
+     )
      model.eval()
      print("Adapter loaded and model is ready.")
+     print(f"PEFT model device map (should match base or be compatible): {model.hf_device_map}")
+
  except Exception as e:
      print(f"CRITICAL ERROR loading adapter: {e}")
-     print(f"Please ensure ADAPTER_MODEL_ID ('{ADAPTER_MODEL_ID}') is correct, public, or HF_TOKEN is set for private models.")
+     print(f"Adapter ID used: '{ADAPTER_MODEL_ID}'")
+     print(f"Base model device map was: {base_model.hf_device_map if 'base_model' in locals() and hasattr(base_model, 'hf_device_map') else 'N/A'}")
+     print(f"Offload folder was: {OFFLOAD_FOLDER}")
      raise RuntimeError(f"Failed to load LoRA adapter: {e}")


- # --- Chat Logic ---
+ # --- Chat Logic (remains the same as your last full version) ---
  def respond(
      message: str,
      history: list[tuple[str | None, str | None]],
@@ -108,15 +116,21 @@ def respond(
      for msg_data in current_processing_messages:
          prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"

-     if not prompt_for_model.strip().endswith("<|assistant|>"):
+     if not prompt_for_model.strip().endswith("<|assistant|>"): # Check before adding
          prompt_for_model += "<|assistant|>"

+
      print(f"--- Sending to Model ---")
      print(f"System Prompt (passed to model if not empty): {active_system_prompt_for_log}")
      print(f"Formatted prompt for model:\n{prompt_for_model}")
      print("------------------------------------")

-     inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE) # model.device could also be used if model is not device_mapped
+     # Determine the device for inputs. If device_map is used, model might be on multiple devices or CPU.
+     # For simplicity, if model.device is available (not a complex map), use it. Otherwise, fallback to DEVICE.
+     # input_device = model.device if hasattr(model, 'device') and not isinstance(model.device, dict) else DEVICE
+     # However, with device_map="auto", inputs should generally be prepared for CPU, and accelerate handles movement.
+     # So, sending inputs to DEVICE (which is 'cpu' here) should be correct.
+     inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)

      eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
      if not isinstance(eos_token_id_for_generation, int):
@@ -124,6 +138,7 @@ def respond(
      if eos_token_id_for_generation is None:
          print("Warning: EOS token ID for generation is None.")

+
      with torch.no_grad():
          outputs = model.generate(
              **inputs,
@@ -156,7 +171,7 @@ def respond(
          current_response_chunk += char_token
          yield current_response_chunk

- # --- Gradio Interface ---
+ # --- Gradio Interface (remains the same as your last full version) ---
  chatbot_ui = gr.ChatInterface(
      fn=respond,
      chatbot=gr.Chatbot(
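
For reference, the loading path this commit moves to can be read as one self-contained snippet. This is only a sketch assembled from the calls shown in the diff above (same model IDs, arguments, and attributes), not an excerpt of the committed file:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora"
OFFLOAD_FOLDER = "./model_offload_cache"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float32,
    device_map="auto",            # let accelerate place layers, offloading to disk if RAM is tight
    trust_remote_code=True,
    attn_implementation="eager",
    offload_folder=OFFLOAD_FOLDER,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)  # adapter inherits the base placement
model.eval()
print(model.hf_device_map)        # inspect where accelerate placed each module

On a CPU-only Space, device_map="auto" keeps everything on CPU and only spills layers to OFFLOAD_FOLDER when RAM runs short, which is why inputs are prepared on DEVICE ('cpu') in respond().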