Update app.py
app.py
CHANGED
@@ -7,8 +7,8 @@ import os
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/Phi-4-mini-instruct"
 # MANDATORY: REPLACE with YOUR Hugging Face username and the adapter ID you pushed
-ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<<
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ADAPTER_MODEL_ID = "aaurelions/phi4-word-keeper-lora" # <<< USING YOUR EXAMPLE, ENSURE THIS IS CORRECT AND PUBLIC/ACCESSIBLE
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Will be 'cpu'
 
 SECRET_WORD_PHRASE_CORE_FOR_EXAMPLE_BUTTON = "programmers who eat Italian food say"
 
@@ -20,52 +20,60 @@ if tokenizer.pad_token is None:
 tokenizer.padding_side = "right"
 print("Tokenizer loaded.")
 
-
-OFFLOAD_FOLDER = "./model_offload_dir" # Name it as you like
+OFFLOAD_FOLDER = "./model_offload_cache" # Using a consistent name
 if not os.path.exists(OFFLOAD_FOLDER):
     try:
         os.makedirs(OFFLOAD_FOLDER)
         print(f"Created offload folder: {OFFLOAD_FOLDER}")
     except OSError as e:
-        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER}: {e}.
-
-        # On HF Spaces, you usually have write permission in /home/user/app/ or /tmp/
-        OFFLOAD_FOLDER = "/tmp/model_offload_dir" # Try /tmp as an alternative
+        print(f"Warning: Could not create offload folder {OFFLOAD_FOLDER} in current dir: {e}. Trying /tmp.")
+        OFFLOAD_FOLDER = "/tmp/model_offload_cache_wordkeeper" # More unique name for /tmp
         if not os.path.exists(OFFLOAD_FOLDER):
             try:
                 os.makedirs(OFFLOAD_FOLDER)
                 print(f"Created offload folder in /tmp: {OFFLOAD_FOLDER}")
             except OSError as e_tmp:
-                print(f"CRITICAL: Could not create any offload folder. Offloading will fail: {e_tmp}")
-                #
+                print(f"CRITICAL: Could not create any offload folder. Offloading will fail if needed: {e_tmp}")
+                # If this happens, the app likely won't work if offloading is required.
 
 print(f"Using offload folder: {OFFLOAD_FOLDER}")
 
-
-print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE}")
+print(f"Loading base model: {BASE_MODEL_ID} on {DEVICE} with device_map='auto'")
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL_ID,
     torch_dtype=torch.float32,
-    device_map="auto",
+    device_map="auto", # This will try to fit on CPU, and offload if it can't
     trust_remote_code=True,
     attn_implementation="eager",
-    offload_folder=OFFLOAD_FOLDER
+    offload_folder=OFFLOAD_FOLDER
 )
-print("Base model loaded.")
+print("Base model loaded with device_map and offload_folder.")
+print(f"Base model device map: {base_model.hf_device_map}") # See what accelerate decided
 
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
-
-    #
+    # Load the PEFT model.
+    # It should respect the base_model's device_map and offload_folder settings.
+    # No need to pass device_map or offload_folder to PeftModel directly
+    # if the base model is already configured.
+    model = PeftModel.from_pretrained(
+        base_model,
+        ADAPTER_MODEL_ID,
+        # adapter_name="default" # Default adapter name
+    )
     model.eval()
     print("Adapter loaded and model is ready.")
+    print(f"PEFT model device map (should match base or be compatible): {model.hf_device_map}")
+
 except Exception as e:
     print(f"CRITICAL ERROR loading adapter: {e}")
-    print(f"
+    print(f"Adapter ID used: '{ADAPTER_MODEL_ID}'")
+    print(f"Base model device map was: {base_model.hf_device_map if 'base_model' in locals() and hasattr(base_model, 'hf_device_map') else 'N/A'}")
+    print(f"Offload folder was: {OFFLOAD_FOLDER}")
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
 
 
-# --- Chat Logic ---
+# --- Chat Logic (remains the same as your last full version) ---
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
@@ -108,15 +116,21 @@ def respond(
     for msg_data in current_processing_messages:
         prompt_for_model += f"<|{msg_data['role']}|>\n{msg_data['content']}<|end|>\n"
 
-    if not prompt_for_model.strip().endswith("<|assistant|>"):
+    if not prompt_for_model.strip().endswith("<|assistant|>"): # Check before adding
        prompt_for_model += "<|assistant|>"
 
+
     print(f"--- Sending to Model ---")
     print(f"System Prompt (passed to model if not empty): {active_system_prompt_for_log}")
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
 
-
+    # Determine the device for inputs. If device_map is used, model might be on multiple devices or CPU.
+    # For simplicity, if model.device is available (not a complex map), use it. Otherwise, fallback to DEVICE.
+    # input_device = model.device if hasattr(model, 'device') and not isinstance(model.device, dict) else DEVICE
+    # However, with device_map="auto", inputs should generally be prepared for CPU, and accelerate handles movement.
+    # So, sending inputs to DEVICE (which is 'cpu' here) should be correct.
+    inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
 
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
     if not isinstance(eos_token_id_for_generation, int):
@@ -124,6 +138,7 @@ def respond(
     if eos_token_id_for_generation is None:
         print("Warning: EOS token ID for generation is None.")
 
+
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -156,7 +171,7 @@ def respond(
         current_response_chunk += char_token
         yield current_response_chunk
 
-# --- Gradio Interface ---
+# --- Gradio Interface (remains the same as your last full version) ---
 chatbot_ui = gr.ChatInterface(
     fn=respond,
     chatbot=gr.Chatbot(