hashhac committed
Commit 5c42f52 · 1 parent: 70541bf
pad fix
app.py CHANGED
@@ -53,36 +53,38 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
-    #
+    # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Print
+    # Print initial configuration
     print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
 
-    # Load
+    # Load model
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         torch_dtype=torch_dtype,
         low_cpu_mem_usage=True
     )
 
-    # Set pad token
-    if tokenizer.
-    #
-
-    num_added = tokenizer.add_special_tokens(special_tokens)
-
-    # Must resize the token embeddings when adding tokens
+    # THE KEY FIX: Set pad token consistently in both tokenizer and model config
+    if tokenizer.pad_token_id is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
+        # Define a special token with ID that doesn't conflict
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     model.resize_token_embeddings(len(tokenizer))
 
-    #
+    # Make sure model config has consistent pad token ID
     model.config.pad_token_id = tokenizer.pad_token_id
 
-
-
-
-
-    print(f"
+    # Important: Also set these token IDs in model config
+    if hasattr(model.config, 'decoder_start_token_id') and model.config.decoder_start_token_id is None:
+        model.config.decoder_start_token_id = tokenizer.pad_token_id
+
+    print(f"Modified token IDs - PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
+    print(f"Model config - PAD: {model.config.pad_token_id}, EOS: {model.config.eos_token_id}")
+
+    # Double-check that model config has pad token ID set
+    if not hasattr(model.config, 'pad_token_id') or model.config.pad_token_id is None:
+        model.config.pad_token_id = tokenizer.pad_token_id
 
     # Move model to the right device
     model.to(device)
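The key change in load_llm_model() is giving the tokenizer a real [PAD] token whenever it is missing or aliased to EOS, resizing the embeddings to cover the new ID, and mirroring that ID into model.config. A minimal standalone sketch of just that fix, assuming the usual transformers/torch stack; the device and dtype choices below are picked for illustration (app.py defines its own):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "facebook/opt-1.3b"
device = "cuda" if torch.cuda.is_available() else "cpu"              # assumed; app.py sets its own device
torch_dtype = torch.float16 if device == "cuda" else torch.float32   # assumed dtype choice

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True)

# Add a dedicated pad token if none exists or it collides with EOS,
# then resize embeddings so the new ID has a row and sync the config.
if tokenizer.pad_token_id is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

print(f"PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
model.to(device)

Resizing right after add_special_tokens matters: without it the new [PAD] ID would index past the embedding matrix.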
@@ -267,22 +269,18 @@ def generate_response(prompt):
 
     full_prompt += "Assistant: "
 
-    #
-    #
-
-
-
-
-        max_length=512, # Fixed length helps with attention masks
-        truncation=True,
-        return_attention_mask=True
-    )
+    # Instead of using the tokenizer to create inputs with padding,
+    # let's prepare the inputs differently:
+    input_ids = llm_tokenizer.encode(full_prompt, return_tensors='pt')
+
+    # Create attention mask manually (all 1's)
+    attention_mask = torch.ones_like(input_ids)
 
     # Move to device
-    input_ids =
-    attention_mask =
+    input_ids = input_ids.to(device)
+    attention_mask = attention_mask.to(device)
 
-    # Generate response
+    # Generate response with completely explicit parameters
    with torch.no_grad():
        output = llm_model.generate(
            input_ids=input_ids,
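The generate_response() change sidesteps padded batches entirely: the prompt is encoded as a single unpadded sequence and the attention mask is simply all ones. Continuing the sketch above; llm_tokenizer/llm_model are assumed aliases for the objects loaded there, and the prompt is illustrative:

llm_tokenizer, llm_model = tokenizer, model   # aliases matching the names generate_response() uses

full_prompt = "User: Hello, how are you?\nAssistant: "   # illustrative prompt

# Encode without padding: one sequence, no pad positions to mask out.
input_ids = llm_tokenizer.encode(full_prompt, return_tensors='pt')

# Every position is a real token, so the mask is all ones.
attention_mask = torch.ones_like(input_ids)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)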
@@ -291,14 +289,16 @@ def generate_response(prompt):
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            pad_token_id=llm_tokenizer.pad_token_id,
-            eos_token_id=llm_tokenizer.eos_token_id
+            pad_token_id=llm_tokenizer.pad_token_id,
+            eos_token_id=llm_tokenizer.eos_token_id,
+            use_cache=True,
+            no_repeat_ngram_size=3
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
     response_text = response_text.split("Assistant: ")[-1].strip()
 
-    # Add assistant response to history
+    # Add assistant response to history
     chat_history.append({"role": "assistant", "content": response_text})
 
     # Keep history at a reasonable size
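Finally, the generate() call spells out the pad/eos IDs (now guaranteed distinct by the load-time fix) plus use_cache and no_repeat_ngram_size. A sketch continuing from the inputs above; max_new_tokens is an assumed cap, since the unchanged context lines that set the real length limit are not shown in this diff:

with torch.no_grad():
    output = llm_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,          # assumed value, not taken from the diff
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=llm_tokenizer.pad_token_id,
        eos_token_id=llm_tokenizer.eos_token_id,
        use_cache=True,
        no_repeat_ngram_size=3,
    )

response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
response_text = response_text.split("Assistant: ")[-1].strip()
print(response_text)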