Commit de55550 (verified) · committed by aaurelions · Parent(s): f836522

Update app.py

Files changed (1):
  1. app.py +5 -10
app.py CHANGED
@@ -52,12 +52,12 @@ print(f"Base model device map: {base_model.hf_device_map}") # See what accelerat
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     # Load the PEFT model.
-    # It should respect the base_model's device_map and offload_folder settings.
-    # No need to pass device_map or offload_folder to PeftModel directly
-    # if the base model is already configured.
+    # Pass offload_folder here as well, as PeftModel's internal dispatching
+    # might need it if accelerate decides to offload parts of the combined model.
     model = PeftModel.from_pretrained(
         base_model,
         ADAPTER_MODEL_ID,
+        offload_folder=OFFLOAD_FOLDER, # <--- FIX APPLIED HERE
         # adapter_name="default" # Default adapter name
     )
     model.eval()
@@ -72,7 +72,7 @@ except Exception as e:
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
 
 
-# --- Chat Logic (remains the same as your last full version) ---
+# --- Chat Logic ---
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
@@ -124,11 +124,6 @@ def respond(
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
 
-    # Determine the device for inputs. If device_map is used, model might be on multiple devices or CPU.
-    # For simplicity, if model.device is available (not a complex map), use it. Otherwise, fallback to DEVICE.
-    # input_device = model.device if hasattr(model, 'device') and not isinstance(model.device, dict) else DEVICE
-    # However, with device_map="auto", inputs should generally be prepared for CPU, and accelerate handles movement.
-    # So, sending inputs to DEVICE (which is 'cpu' here) should be correct.
     inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
 
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
@@ -170,7 +165,7 @@ def respond(
         current_response_chunk += char_token
         yield current_response_chunk
 
-# --- Gradio Interface (remains the same as your last full version) ---
+# --- Gradio Interface ---
 chatbot_ui = gr.ChatInterface(
     fn=respond,
     chatbot=gr.Chatbot(
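
For context, a minimal, self-contained sketch of the load-and-generate path this commit touches. It is an illustration, not the Space's actual app.py: BASE_MODEL_ID, the dtype, the prompt string, and the generate() arguments are assumptions, while ADAPTER_MODEL_ID, OFFLOAD_FOLDER, DEVICE, and the "<|end|>" stop token mirror the diff above.

# Hedged sketch, not the Space's actual app.py. BASE_MODEL_ID, the dtype, the
# prompt, and the generate() kwargs are assumptions; ADAPTER_MODEL_ID,
# OFFLOAD_FOLDER, DEVICE, and the "<|end|>" stop token mirror the diff.
import os

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"  # assumption: a base model that uses "<|end|>"
ADAPTER_MODEL_ID = "your-username/your-lora-adapter"  # placeholder for the adapter repo
OFFLOAD_FOLDER = "offload"
DEVICE = "cpu"

os.makedirs(OFFLOAD_FOLDER, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# device_map="auto" lets accelerate place weights; anything that does not fit
# in memory is offloaded to disk under OFFLOAD_FOLDER.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    offload_folder=OFFLOAD_FOLDER,
    torch_dtype=torch.float32,
)

# The fix in this commit: PeftModel.from_pretrained re-dispatches the combined
# base+adapter model, so it needs its own offload_folder in case accelerate
# offloads parts of it again.
model = PeftModel.from_pretrained(
    base_model,
    ADAPTER_MODEL_ID,
    offload_folder=OFFLOAD_FOLDER,
)
model.eval()

# Inputs are prepared on DEVICE ("cpu"); with device_map="auto", accelerate's
# hooks move tensors to the right device during the forward pass.
prompt_for_model = "<|user|>\nHello!<|end|>\n<|assistant|>\n"  # assumption: Phi-3-style chat format
inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)

eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")

with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=64,  # assumption: the diff does not show generate() kwargs
        eos_token_id=eos_token_id_for_generation,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

The third offload_folder is the point of the commit: without it, a dispatch of the combined model that needs disk offload typically fails with accelerate's "We need an `offload_dir`" error, even though the base model was loaded with one.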