hashhac committed
Commit fe65571 · 1 parent: 7dc0ac9

no more eos tockens for padding
app.py CHANGED
@@ -54,15 +54,24 @@ def load_llm_model():
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Ensure pad token is set
+    # Ensure pad token is set to something different than EOS token
     if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
+        # Use a different special token as padding token
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # Resize the token embeddings since we added a new token
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=True
+        )
+        model.resize_token_embeddings(len(tokenizer))
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=True
+        )
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch_dtype,
-        low_cpu_mem_usage=True
-    )
     model.to(device)
 
     return model, tokenizer
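For context, here is a minimal standalone sketch of the pad-token handling introduced above. It is not the Space's code: "gpt2" is only a placeholder for the actual model_id defined elsewhere in app.py, and the model is loaded once up front rather than inside each branch as in the diff.

# Sketch only: "gpt2" stands in for the Space's model_id, and the model is
# loaded once instead of inside each branch of the pad-token check.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)

if tokenizer.pad_token is None:
    # Register a dedicated [PAD] token rather than reusing the EOS token,
    # so padding positions stay distinct from end-of-sequence positions.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The vocabulary grew by one entry, so the embedding matrix must grow too.
    model.resize_token_embeddings(len(tokenizer))

print(tokenizer.pad_token, tokenizer.pad_token_id)  # '[PAD]' and its new id

Resizing is required whenever add_special_tokens enlarges the vocabulary; without it, the new [PAD] id would index past the end of the embedding matrix.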
@@ -135,25 +144,27 @@ def generate_response(prompt):
     full_prompt += "Assistant: "
 
     # Generate response with proper attention mask
-    #
-    tokenized_inputs = llm_tokenizer(
+    # Let the tokenizer create the attention mask automatically
+    tokenized_inputs = llm_tokenizer(
+        full_prompt,
+        return_tensors="pt",
+        padding=True,
+        return_attention_mask=True  # This generates the proper attention mask
+    )
 
     # Move to device
     input_ids = tokenized_inputs["input_ids"].to(device)
-
-    # Create attention mask with 1s for all tokens (no padding)
-    attention_mask = torch.ones_like(input_ids)
+    attention_mask = tokenized_inputs["attention_mask"].to(device)
 
     # Generate response
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
-            attention_mask=attention_mask,
+            attention_mask=attention_mask,  # Use the tokenizer's attention mask
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
-            top_p=0.9,
-            pad_token_id=llm_tokenizer.eos_token_id  # Explicitly set pad token ID
+            top_p=0.9
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
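A short end-to-end sketch of the generation path after this change. The names llm_model, llm_tokenizer, device, and full_prompt mirror app.py, but the model id and prompt below are placeholders, and the setup repeats the [PAD] handling from load_llm_model() so the snippet runs on its own.

# Sketch only: "gpt2" and the prompt are placeholders; the pad-token setup
# mirrors load_llm_model() above so this runs standalone.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
llm_tokenizer = AutoTokenizer.from_pretrained("gpt2")
llm_model = AutoModelForCausalLM.from_pretrained("gpt2")
if llm_tokenizer.pad_token is None:
    llm_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    llm_model.resize_token_embeddings(len(llm_tokenizer))
llm_model.to(device)

full_prompt = "User: Hello, who are you?\nAssistant: "  # placeholder prompt

# Let the tokenizer build the attention mask instead of torch.ones_like(input_ids)
tokenized_inputs = llm_tokenizer(
    full_prompt,
    return_tensors="pt",
    padding=True,
    return_attention_mask=True,
)
input_ids = tokenized_inputs["input_ids"].to(device)
attention_mask = tokenized_inputs["attention_mask"].to(device)

with torch.no_grad():
    output = llm_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
print(response_text)

With a single, unpadded prompt the mask is all ones anyway, but taking it from the tokenizer keeps the code correct if batching or real padding is introduced later.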