Update app.py
app.py
CHANGED
@@ -52,12 +52,12 @@ print(f"Base model device map: {base_model.hf_device_map}") # See what accelerat
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     # Load the PEFT model.
-    #
-    #
-    # if the base model is already configured.
+    # Pass offload_folder here as well, as PeftModel's internal dispatching
+    # might need it if accelerate decides to offload parts of the combined model.
     model = PeftModel.from_pretrained(
         base_model,
         ADAPTER_MODEL_ID,
+        offload_folder=OFFLOAD_FOLDER, # <--- FIX APPLIED HERE
         # adapter_name="default" # Default adapter name
     )
     model.eval()
@@ -72,7 +72,7 @@ except Exception as e:
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")


-# --- Chat Logic
+# --- Chat Logic ---
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
@@ -124,11 +124,6 @@ def respond(
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")

-    # Determine the device for inputs. If device_map is used, model might be on multiple devices or CPU.
-    # For simplicity, if model.device is available (not a complex map), use it. Otherwise, fallback to DEVICE.
-    # input_device = model.device if hasattr(model, 'device') and not isinstance(model.device, dict) else DEVICE
-    # However, with device_map="auto", inputs should generally be prepared for CPU, and accelerate handles movement.
-    # So, sending inputs to DEVICE (which is 'cpu' here) should be correct.
     inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)

     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
@@ -170,7 +165,7 @@ def respond(
             current_response_chunk += char_token
             yield current_response_chunk

-# --- Gradio Interface
+# --- Gradio Interface ---
 chatbot_ui = gr.ChatInterface(
     fn=respond,
     chatbot=gr.Chatbot(
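
For readers landing on this commit: the change passes offload_folder to PeftModel.from_pretrained, mirroring what the base-model load already does, so accelerate has a disk location to spill weights into if it offloads parts of the combined model. Below is a minimal sketch of that loading path; the model IDs and the float32/CPU settings are illustrative assumptions, and ADAPTER_MODEL_ID, OFFLOAD_FOLDER, and DEVICE stand in for the constants defined earlier in app.py (not shown in this diff).

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"    # assumption: any causal-LM base
ADAPTER_MODEL_ID = "your-username/your-lora-adapter"  # assumption: a LoRA adapter repo
OFFLOAD_FOLDER = "offload"  # disk directory accelerate may spill weights into
DEVICE = "cpu"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Base model: device_map="auto" lets accelerate place (and possibly offload)
# layers, so it needs an offload folder on disk.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float32,
    offload_folder=OFFLOAD_FOLDER,
)

# The commit's fix: PeftModel.from_pretrained re-dispatches the combined
# model, so it may need the same offload folder.
model = PeftModel.from_pretrained(
    base_model,
    ADAPTER_MODEL_ID,
    offload_folder=OFFLOAD_FOLDER,
)
model.eval()

# As the comments removed in the third hunk explained: with device_map="auto",
# inputs are prepared on CPU and accelerate handles device movement.
inputs = tokenizer("Hello", return_tensors="pt", return_attention_mask=True).to(DEVICE)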