hashhac committed
Commit 5c42f52 · 1 parent: 70541bf
pad fix
app.py CHANGED
@@ -53,36 +53,38 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
-    #
+    # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Print
+    # Print initial configuration
     print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
 
-    # Load
+    # Load model
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         torch_dtype=torch_dtype,
         low_cpu_mem_usage=True
     )
 
-    # Set pad token
-    if tokenizer.
-    #
-
-    num_added = tokenizer.add_special_tokens(special_tokens)
-
-    # Must resize the token embeddings when adding tokens
+    # THE KEY FIX: Set pad token consistently in both tokenizer and model config
+    if tokenizer.pad_token_id is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
+        # Define a special token with ID that doesn't conflict
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     model.resize_token_embeddings(len(tokenizer))
 
-    #
+    # Make sure model config has consistent pad token ID
     model.config.pad_token_id = tokenizer.pad_token_id
 
-
-
-
-
-    print(f"
+    # Important: Also set these token IDs in model config
+    if hasattr(model.config, 'decoder_start_token_id') and model.config.decoder_start_token_id is None:
+        model.config.decoder_start_token_id = tokenizer.pad_token_id
+
+    print(f"Modified token IDs - PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
+    print(f"Model config - PAD: {model.config.pad_token_id}, EOS: {model.config.eos_token_id}")
+
+    # Double-check that model config has pad token ID set
+    if not hasattr(model.config, 'pad_token_id') or model.config.pad_token_id is None:
+        model.config.pad_token_id = tokenizer.pad_token_id
 
     # Move model to the right device
     model.to(device)
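The key change in load_llm_model() is giving the tokenizer a real [PAD] token whenever it is missing or aliased to EOS, resizing the embeddings to cover the new ID, and mirroring that ID into model.config. A minimal standalone sketch of just that fix, assuming the usual transformers/torch stack; the device and dtype choices below are picked for illustration (app.py defines its own):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "facebook/opt-1.3b"
device = "cuda" if torch.cuda.is_available() else "cpu"              # assumed; app.py sets its own device
torch_dtype = torch.float16 if device == "cuda" else torch.float32   # assumed dtype choice

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True)

# Add a dedicated pad token if none exists or it collides with EOS,
# then resize embeddings so the new ID has a row and sync the config.
if tokenizer.pad_token_id is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

print(f"PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
model.to(device)

Resizing right after add_special_tokens matters: without it the new [PAD] ID would index past the embedding matrix.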
@@ -267,22 +269,18 @@ def generate_response(prompt):
 
     full_prompt += "Assistant: "
 
-    #
-    #
-
-
-
-
-        max_length=512, # Fixed length helps with attention masks
-        truncation=True,
-        return_attention_mask=True
-    )
+    # Instead of using the tokenizer to create inputs with padding,
+    # let's prepare the inputs differently:
+    input_ids = llm_tokenizer.encode(full_prompt, return_tensors='pt')
+
+    # Create attention mask manually (all 1's)
+    attention_mask = torch.ones_like(input_ids)
 
     # Move to device
-    input_ids =
-    attention_mask =
+    input_ids = input_ids.to(device)
+    attention_mask = attention_mask.to(device)
 
-    # Generate response
+    # Generate response with completely explicit parameters
    with torch.no_grad():
        output = llm_model.generate(
            input_ids=input_ids,
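The generate_response() change sidesteps padded batches entirely: the prompt is encoded as a single unpadded sequence and the attention mask is simply all ones. Continuing the sketch above; llm_tokenizer/llm_model are assumed aliases for the objects loaded there, and the prompt is illustrative:

llm_tokenizer, llm_model = tokenizer, model   # aliases matching the names generate_response() uses

full_prompt = "User: Hello, how are you?\nAssistant: "   # illustrative prompt

# Encode without padding: one sequence, no pad positions to mask out.
input_ids = llm_tokenizer.encode(full_prompt, return_tensors='pt')

# Every position is a real token, so the mask is all ones.
attention_mask = torch.ones_like(input_ids)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)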
@@ -291,14 +289,16 @@ def generate_response(prompt):
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            pad_token_id=llm_tokenizer.pad_token_id,
-            eos_token_id=llm_tokenizer.eos_token_id
+            pad_token_id=llm_tokenizer.pad_token_id,
+            eos_token_id=llm_tokenizer.eos_token_id,
+            use_cache=True,
+            no_repeat_ngram_size=3
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
     response_text = response_text.split("Assistant: ")[-1].strip()
 
-    # Add assistant response to history
+    # Add assistant response to history
     chat_history.append({"role": "assistant", "content": response_text})
 
     # Keep history at a reasonable size
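Finally, the generate() call spells out the pad/eos IDs (now guaranteed distinct by the load-time fix) plus use_cache and no_repeat_ngram_size. A sketch continuing from the inputs above; max_new_tokens is an assumed cap, since the unchanged context lines that set the real length limit are not shown in this diff:

with torch.no_grad():
    output = llm_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,          # assumed value, not taken from the diff
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=llm_tokenizer.pad_token_id,
        eos_token_id=llm_tokenizer.eos_token_id,
        use_cache=True,
        no_repeat_ngram_size=3,
    )

response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
response_text = response_text.split("Assistant: ")[-1].strip()
print(response_text)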