hashhac committed
Commit 190ab02 · 1 Parent(s): 5c42f52
testing 3
app.py
CHANGED
@@ -53,42 +53,43 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
-    # Load tokenizer
+    # Load tokenizer with special attention to the padding token
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     # Print initial configuration
     print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
 
-    #
+    # For OPT models specifically - configure tokenizer before loading model
+    if tokenizer.pad_token is None:
+        # Use a completely different token as pad token - must be done before model loading
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # Ensure pad token is really different from EOS token
+        assert tokenizer.pad_token_id != tokenizer.eos_token_id, "Pad token still same as EOS token!"
+        print(f"Added special PAD token with ID {tokenizer.pad_token_id} (different from EOS: {tokenizer.eos_token_id})")
+
+    # Load model with the knowledge that tokenizer may have been modified
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         torch_dtype=torch_dtype,
         low_cpu_mem_usage=True
     )
 
-    #
-
-    # Define a special token with ID that doesn't conflict
-    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-    model.resize_token_embeddings(len(tokenizer))
-
-    # Make sure model config has consistent pad token ID
-    model.config.pad_token_id = tokenizer.pad_token_id
-
-    # Important: Also set these token IDs in model config
-    if hasattr(model.config, 'decoder_start_token_id') and model.config.decoder_start_token_id is None:
-        model.config.decoder_start_token_id = tokenizer.pad_token_id
-
-    print(f"Modified token IDs - PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
-    print(f"Model config - PAD: {model.config.pad_token_id}, EOS: {model.config.eos_token_id}")
+    # Resize embeddings to match tokenizer
+    model.resize_token_embeddings(len(tokenizer))
 
-    #
-
-    model.config.pad_token_id = tokenizer.pad_token_id
+    # CRITICAL: Make sure model config knows about the pad token
+    model.config.pad_token_id = tokenizer.pad_token_id
 
-    #
+    # OPT models need this explicit configuration
+    if hasattr(model.config, "word_embed_proj_dim"):
+        model.config._remove_wrong_keys = False
+
+    # Move model to device
     model.to(device)
 
+    print(f"Final token setup - Pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+    print(f"Model config pad_token_id: {model.config.pad_token_id}")
+
     return model, tokenizer
 
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)

@@ -257,7 +258,7 @@ def generate_response(prompt):
     # Add user message to history
     chat_history.append({"role": "user", "content": prompt})
 
-    #
+    # Build full prompt from chat history
     full_prompt = ""
     for message in chat_history:
         if message["role"] == "system":

@@ -269,39 +270,62 @@ def generate_response(prompt):
 
     full_prompt += "Assistant: "
 
-    #
-
-
+    # Use encode_plus which offers more control
+    encoded_input = llm_tokenizer.encode_plus(
+        full_prompt,
+        return_tensors="pt",
+        padding=False, # Don't pad here - we'll handle it manually
+        add_special_tokens=True,
+        return_attention_mask=True
+    )
+
+    # Extract and move tensors to device
+    input_ids = encoded_input["input_ids"].to(device)
 
-    # Create attention mask
-    attention_mask = torch.ones_like(input_ids)
+    # Create attention mask explicitly - all 1s for a non-padded sequence
+    attention_mask = torch.ones_like(input_ids).to(device)
 
-    #
-
-    attention_mask = attention_mask.to(device)
+    # Print for debugging
+    print(f"Input shape: {input_ids.shape}, Attention mask shape: {attention_mask.shape}")
 
-    # Generate
+    # Generate with very explicit parameters for OPT models
     with torch.no_grad():
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            output = llm_model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask, # Explicitly pass attention mask
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                pad_token_id=llm_tokenizer.pad_token_id, # Explicitly set pad token ID
+                eos_token_id=llm_tokenizer.eos_token_id, # Explicitly set EOS token ID
+                use_cache=True,
+                no_repeat_ngram_size=3,
+                # Add these parameters specifically for OPT
+                forced_bos_token_id=None,
+                forced_eos_token_id=None,
+                num_beams=1 # Simple greedy decoding with temperature
+            )
+
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            # Fallback with simpler parameters
+            output = llm_model.generate(
+                input_ids=input_ids,
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7
+            )
 
+    # Decode only the generated part (not the input)
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
     response_text = response_text.split("Assistant: ")[-1].strip()
 
-    # Add assistant response to history
+    # Add assistant response to history
     chat_history.append({"role": "assistant", "content": response_text})
 
-    # Keep history
+    # Keep history manageable
     if len(chat_history) > 10:
         # Keep system message and last 9 exchanges
         chat_history.pop(1)
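The tokenizer-side change in load_llm_model() only adds a [PAD] token when the checkpoint ships without one, then keeps it distinct from EOS and mirrors it into model.config. That pattern can be exercised on its own; the following is a small illustrative sketch (not code from this commit) using the same facebook/opt-1.3b tokenizer:

# Sketch only: the pad-token handling pattern added in load_llm_model(), in isolation.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
print("before:", tokenizer.pad_token, tokenizer.pad_token_id, tokenizer.eos_token_id)

if tokenizer.pad_token is None:
    # Only triggers when the checkpoint has no pad token configured.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# After this, pad_token_id is safe to copy into model.config.pad_token_id,
# as the updated load_llm_model() does.
print("after:", tokenizer.pad_token, tokenizer.pad_token_id, tokenizer.eos_token_id)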
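The updated functions lean on module-level names defined elsewhere in app.py (device, torch_dtype, llm_model, llm_tokenizer, chat_history). A rough sketch of how the pieces are wired at startup; everything here outside load_llm_model() and generate_response() is an assumption about the rest of the file, not part of this commit:

# Sketch of the surrounding wiring (assumed, not from this commit).
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Assumed seed history; app.py may initialise this differently.
chat_history = [{"role": "system", "content": "You are a helpful voice assistant."}]

llm_model, llm_tokenizer = load_llm_model()

generate_response("Hello, can you hear me?")
print(chat_history[-1]["content"])  # reply appended by generate_response()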