hashhac committed on
Commit fe65571 · 1 Parent(s): 7dc0ac9

No more EOS tokens for padding

Files changed (1)
  1. app.py +26 -15
app.py CHANGED
@@ -54,15 +54,24 @@ def load_llm_model():
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Ensure pad token is set
+    # Ensure pad token is set to something different than EOS token
     if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token  # Set pad token to end of sequence token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch_dtype,
-        low_cpu_mem_usage=True
-    )
+        # Use a different special token as padding token
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # Resize the token embeddings since we added a new token
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=True
+        )
+        model.resize_token_embeddings(len(tokenizer))
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=True
+        )
+
     model.to(device)
 
     return model, tokenizer

@@ -135,25 +144,27 @@ def generate_response(prompt):
     full_prompt += "Assistant: "
 
     # Generate response with proper attention mask
-    # First, tokenize the input text
-    tokenized_inputs = llm_tokenizer(full_prompt, return_tensors="pt", padding=True)
+    # Let the tokenizer create the attention mask automatically
+    tokenized_inputs = llm_tokenizer(
+        full_prompt,
+        return_tensors="pt",
+        padding=True,
+        return_attention_mask=True  # This generates the proper attention mask
+    )
 
     # Move to device
     input_ids = tokenized_inputs["input_ids"].to(device)
-
-    # Create attention mask with 1s for all tokens (no padding)
-    attention_mask = torch.ones_like(input_ids)
+    attention_mask = tokenized_inputs["attention_mask"].to(device)
 
     # Generate response
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
-            attention_mask=attention_mask,
+            attention_mask=attention_mask,  # Use the tokenizer's attention mask
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
-            top_p=0.9,
-            pad_token_id=llm_tokenizer.eos_token_id  # Explicitly set pad token ID
+            top_p=0.9
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
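
Taken together, the two hunks amount to roughly the following self-contained sketch of the new flow. This is an illustration only: model_id, device, torch_dtype and the prompt below are placeholder values, not the ones defined elsewhere in app.py.

# Minimal sketch of the post-commit flow: a dedicated [PAD] token plus an
# explicit attention mask, instead of reusing EOS for padding.
# NOTE: model_id, device, torch_dtype and the prompt are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # placeholder model
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    # Register a dedicated pad token, then resize embeddings to cover it
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
    )
    model.resize_token_embeddings(len(tokenizer))
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
    )
model.to(device)

# Tokenize with an explicit attention mask and pass it to generate()
inputs = tokenizer(
    "User: hi\nAssistant: ",
    return_tensors="pt",
    padding=True,
    return_attention_mask=True,
)
with torch.no_grad():
    output = model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
print(tokenizer.decode(output[0], skip_special_tokens=True))

With a separate pad token, padded positions are no longer indistinguishable from genuine end-of-sequence tokens, which is presumably why the explicit pad_token_id=llm_tokenizer.eos_token_id argument to generate() was dropped in this commit.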