hashhac committed
Commit 190ab02 · 1 Parent(s): 5c42f52
Files changed (1)
  1. app.py +69 -45
app.py CHANGED
@@ -53,42 +53,43 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
-    # Load tokenizer
+    # Load tokenizer with special attention to the padding token
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     # Print initial configuration
     print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
 
-    # Load model
+    # For OPT models specifically - configure tokenizer before loading model
+    if tokenizer.pad_token is None:
+        # Use a completely different token as pad token - must be done before model loading
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # Ensure pad token is really different from EOS token
+        assert tokenizer.pad_token_id != tokenizer.eos_token_id, "Pad token still same as EOS token!"
+        print(f"Added special PAD token with ID {tokenizer.pad_token_id} (different from EOS: {tokenizer.eos_token_id})")
+
+    # Load model with the knowledge that tokenizer may have been modified
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         torch_dtype=torch_dtype,
         low_cpu_mem_usage=True
     )
 
-    # THE KEY FIX: Set pad token consistently in both tokenizer and model config
-    if tokenizer.pad_token_id is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
-        # Define a special token with ID that doesn't conflict
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-        model.resize_token_embeddings(len(tokenizer))
-
-        # Make sure model config has consistent pad token ID
-        model.config.pad_token_id = tokenizer.pad_token_id
-
-        # Important: Also set these token IDs in model config
-        if hasattr(model.config, 'decoder_start_token_id') and model.config.decoder_start_token_id is None:
-            model.config.decoder_start_token_id = tokenizer.pad_token_id
-
-        print(f"Modified token IDs - PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
-        print(f"Model config - PAD: {model.config.pad_token_id}, EOS: {model.config.eos_token_id}")
+    # Resize embeddings to match tokenizer
+    model.resize_token_embeddings(len(tokenizer))
 
-    # Double-check that model config has pad token ID set
-    if not hasattr(model.config, 'pad_token_id') or model.config.pad_token_id is None:
-        model.config.pad_token_id = tokenizer.pad_token_id
+    # CRITICAL: Make sure model config knows about the pad token
+    model.config.pad_token_id = tokenizer.pad_token_id
 
-    # Move model to the right device
+    # OPT models need this explicit configuration
+    if hasattr(model.config, "word_embed_proj_dim"):
+        model.config._remove_wrong_keys = False
+
+    # Move model to device
     model.to(device)
 
+    print(f"Final token setup - Pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+    print(f"Model config pad_token_id: {model.config.pad_token_id}")
+
     return model, tokenizer
 
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
@@ -257,7 +258,7 @@ def generate_response(prompt):
     # Add user message to history
     chat_history.append({"role": "user", "content": prompt})
 
-    # Prepare input for the model
+    # Build full prompt from chat history
     full_prompt = ""
     for message in chat_history:
         if message["role"] == "system":
@@ -269,39 +270,62 @@ def generate_response(prompt):
 
     full_prompt += "Assistant: "
 
-    # Instead of using the tokenizer to create inputs with padding,
-    # let's prepare the inputs differently:
-    input_ids = llm_tokenizer.encode(full_prompt, return_tensors='pt')
+    # Use encode_plus which offers more control
+    encoded_input = llm_tokenizer.encode_plus(
+        full_prompt,
+        return_tensors="pt",
+        padding=False,  # Don't pad here - we'll handle it manually
+        add_special_tokens=True,
+        return_attention_mask=True
+    )
+
+    # Extract and move tensors to device
+    input_ids = encoded_input["input_ids"].to(device)
 
-    # Create attention mask manually (all 1's)
-    attention_mask = torch.ones_like(input_ids)
+    # Create attention mask explicitly - all 1s for a non-padded sequence
+    attention_mask = torch.ones_like(input_ids).to(device)
 
-    # Move to device
-    input_ids = input_ids.to(device)
-    attention_mask = attention_mask.to(device)
+    # Print for debugging
+    print(f"Input shape: {input_ids.shape}, Attention mask shape: {attention_mask.shape}")
 
-    # Generate response with completely explicit parameters
+    # Generate with very explicit parameters for OPT models
     with torch.no_grad():
-        output = llm_model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=128,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
-            pad_token_id=llm_tokenizer.pad_token_id,
-            eos_token_id=llm_tokenizer.eos_token_id,
-            use_cache=True,
-            no_repeat_ngram_size=3
-        )
+        try:
+            output = llm_model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,  # Explicitly pass attention mask
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                pad_token_id=llm_tokenizer.pad_token_id,  # Explicitly set pad token ID
+                eos_token_id=llm_tokenizer.eos_token_id,  # Explicitly set EOS token ID
+                use_cache=True,
+                no_repeat_ngram_size=3,
+                # Add these parameters specifically for OPT
+                forced_bos_token_id=None,
+                forced_eos_token_id=None,
+                num_beams=1  # Simple greedy decoding with temperature
+            )
+
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            # Fallback with simpler parameters
+            output = llm_model.generate(
+                input_ids=input_ids,
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7
+            )
 
+    # Decode only the generated part (not the input)
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
     response_text = response_text.split("Assistant: ")[-1].strip()
 
-    # Add assistant response to history
+    # Add assistant response to history
     chat_history.append({"role": "assistant", "content": response_text})
 
-    # Keep history at a reasonable size
+    # Keep history manageable
     if len(chat_history) > 10:
         # Keep system message and last 9 exchanges
         chat_history.pop(1)
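
For reference, a minimal standalone sketch of the two fixes this commit makes to app.py: giving the tokenizer a pad token distinct from EOS (and keeping the embedding matrix and model config in sync), and passing an explicit attention mask plus pad/eos token IDs to generate(). This is an illustration assuming the same facebook/opt-1.3b checkpoint and a plain torch/transformers environment; dtype and error handling are simplified relative to the committed code.

# Minimal sketch (not part of app.py): the pad-token fix and the explicit generate() call in isolation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "facebook/opt-1.3b"  # same checkpoint as app.py

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)

# Fix 1: ensure the pad token exists and differs from EOS, then resize the
# embeddings for the enlarged vocabulary and record the pad ID in the config.
if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

model.to(device)

# Fix 2: generate with an explicit attention mask and explicit pad/eos IDs,
# which avoids the missing-attention-mask / pad-token warning from generate().
prompt = "User: Hello!\nAssistant: "
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=32,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(output[0], skip_special_tokens=True))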