Spaces:

Athspi
/

Tttt

Sleeping

Athspi committed on Mar 18

Commit

aa37cb9

verified ·

1 Parent(s): 8c2acb9

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -11,6 +11,10 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto"
 )
 # System prompt
 system_prompt = "You are a friendly assistant named FastLlama."
@@ -25,21 +29,27 @@ def respond(message: str, history: list):
     # Format the prompt with chat history
     full_prompt = format_prompt(message, history)
-    # Tokenize input
-    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
-    # Generate response
     output = model.generate(
         inputs.input_ids,
         max_new_tokens=256,
         temperature=0.7,
         top_p=0.9,
         repetition_penalty=1.1,
         do_sample=True,
-        pad_token_id=tokenizer.eos_token_id
     )
-    # Decode response
     response = tokenizer.decode(
         output[0][inputs.input_ids.shape[-1]:],
         skip_special_tokens=True
@@ -60,6 +70,5 @@ chat = gr.ChatInterface(
     cache_examples=False
 )
-# Launch the app
 if __name__ == "__main__":
     chat.launch(server_name="0.0.0.0")

     device_map="auto"
 )
+# Explicitly set padding token
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
 # System prompt
 system_prompt = "You are a friendly assistant named FastLlama."
     # Format the prompt with chat history
     full_prompt = format_prompt(message, history)
+    # Tokenize input with attention mask
+    inputs = tokenizer(
+        full_prompt,
+        return_tensors="pt",
+        padding=True,
+        truncation=True
+    ).to(model.device)
+    # Generate response with attention mask
     output = model.generate(
         inputs.input_ids,
+        attention_mask=inputs.attention_mask,
         max_new_tokens=256,
         temperature=0.7,
         top_p=0.9,
         repetition_penalty=1.1,
         do_sample=True,
+        pad_token_id=tokenizer.pad_token_id
     )
+    # Decode response while skipping special tokens
     response = tokenizer.decode(
         output[0][inputs.input_ids.shape[-1]:],
         skip_special_tokens=True
     cache_examples=False
 )
 if __name__ == "__main__":
     chat.launch(server_name="0.0.0.0")