hashhac committed
Commit fe65571 · 1 parent: 7dc0ac9

no more eos tockens for padding
app.py CHANGED
@@ -54,15 +54,24 @@ def load_llm_model():
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Ensure pad token is set
+    # Ensure pad token is set to something different than EOS token
     if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
+        # Use a different special token as padding token
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # Resize the token embeddings since we added a new token
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=True
+        )
+        model.resize_token_embeddings(len(tokenizer))
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=True
+        )
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch_dtype,
-        low_cpu_mem_usage=True
-    )
     model.to(device)
 
     return model, tokenizer
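For context, here is a minimal standalone sketch of the pad-token handling introduced above. It is not the Space's code: "gpt2" is only a placeholder for the actual model_id defined elsewhere in app.py, and the model is loaded once up front rather than inside each branch as in the diff.

# Sketch only: "gpt2" stands in for the Space's model_id, and the model is
# loaded once instead of inside each branch of the pad-token check.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)

if tokenizer.pad_token is None:
    # Register a dedicated [PAD] token rather than reusing the EOS token,
    # so padding positions stay distinct from end-of-sequence positions.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The vocabulary grew by one entry, so the embedding matrix must grow too.
    model.resize_token_embeddings(len(tokenizer))

print(tokenizer.pad_token, tokenizer.pad_token_id)  # '[PAD]' and its new id

Resizing is required whenever add_special_tokens enlarges the vocabulary; without it, the new [PAD] id would index past the end of the embedding matrix.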
@@ -135,25 +144,27 @@ def generate_response(prompt):
     full_prompt += "Assistant: "
 
     # Generate response with proper attention mask
-    #
-    tokenized_inputs = llm_tokenizer(
+    # Let the tokenizer create the attention mask automatically
+    tokenized_inputs = llm_tokenizer(
+        full_prompt,
+        return_tensors="pt",
+        padding=True,
+        return_attention_mask=True  # This generates the proper attention mask
+    )
 
     # Move to device
     input_ids = tokenized_inputs["input_ids"].to(device)
-
-    # Create attention mask with 1s for all tokens (no padding)
-    attention_mask = torch.ones_like(input_ids)
+    attention_mask = tokenized_inputs["attention_mask"].to(device)
 
     # Generate response
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
-            attention_mask=attention_mask,
+            attention_mask=attention_mask,  # Use the tokenizer's attention mask
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
-            top_p=0.9,
-            pad_token_id=llm_tokenizer.eos_token_id  # Explicitly set pad token ID
+            top_p=0.9
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
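A short end-to-end sketch of the generation path after this change. The names llm_model, llm_tokenizer, device, and full_prompt mirror app.py, but the model id and prompt below are placeholders, and the setup repeats the [PAD] handling from load_llm_model() so the snippet runs on its own.

# Sketch only: "gpt2" and the prompt are placeholders; the pad-token setup
# mirrors load_llm_model() above so this runs standalone.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
llm_tokenizer = AutoTokenizer.from_pretrained("gpt2")
llm_model = AutoModelForCausalLM.from_pretrained("gpt2")
if llm_tokenizer.pad_token is None:
    llm_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    llm_model.resize_token_embeddings(len(llm_tokenizer))
llm_model.to(device)

full_prompt = "User: Hello, who are you?\nAssistant: "  # placeholder prompt

# Let the tokenizer build the attention mask instead of torch.ones_like(input_ids)
tokenized_inputs = llm_tokenizer(
    full_prompt,
    return_tensors="pt",
    padding=True,
    return_attention_mask=True,
)
input_ids = tokenized_inputs["input_ids"].to(device)
attention_mask = tokenized_inputs["attention_mask"].to(device)

with torch.no_grad():
    output = llm_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
print(response_text)

With a single, unpadded prompt the mask is all ones anyway, but taking it from the tokenizer keeps the code correct if batching or real padding is introduced later.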