hashhac committed
Commit 190ab02 · 1 Parent(s): 5c42f52
Files changed (1)
  1. app.py +69 -45
app.py CHANGED
@@ -53,42 +53,43 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
-    # Load tokenizer
+    # Load tokenizer with special attention to the padding token
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     # Print initial configuration
     print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
 
-    # Load model
+    # For OPT models specifically - configure tokenizer before loading model
+    if tokenizer.pad_token is None:
+        # Use a completely different token as pad token - must be done before model loading
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # Ensure pad token is really different from EOS token
+        assert tokenizer.pad_token_id != tokenizer.eos_token_id, "Pad token still same as EOS token!"
+        print(f"Added special PAD token with ID {tokenizer.pad_token_id} (different from EOS: {tokenizer.eos_token_id})")
+
+    # Load model with the knowledge that tokenizer may have been modified
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         torch_dtype=torch_dtype,
         low_cpu_mem_usage=True
     )
 
-    # THE KEY FIX: Set pad token consistently in both tokenizer and model config
-    if tokenizer.pad_token_id is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
-        # Define a special token with ID that doesn't conflict
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-        model.resize_token_embeddings(len(tokenizer))
-
-        # Make sure model config has consistent pad token ID
-        model.config.pad_token_id = tokenizer.pad_token_id
-
-        # Important: Also set these token IDs in model config
-        if hasattr(model.config, 'decoder_start_token_id') and model.config.decoder_start_token_id is None:
-            model.config.decoder_start_token_id = tokenizer.pad_token_id
-
-        print(f"Modified token IDs - PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
-        print(f"Model config - PAD: {model.config.pad_token_id}, EOS: {model.config.eos_token_id}")
+    # Resize embeddings to match tokenizer
+    model.resize_token_embeddings(len(tokenizer))
 
-    # Double-check that model config has pad token ID set
-    if not hasattr(model.config, 'pad_token_id') or model.config.pad_token_id is None:
-        model.config.pad_token_id = tokenizer.pad_token_id
+    # CRITICAL: Make sure model config knows about the pad token
+    model.config.pad_token_id = tokenizer.pad_token_id
 
-    # Move model to the right device
+    # OPT models need this explicit configuration
+    if hasattr(model.config, "word_embed_proj_dim"):
+        model.config._remove_wrong_keys = False
+
+    # Move model to device
     model.to(device)
 
+    print(f"Final token setup - Pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+    print(f"Model config pad_token_id: {model.config.pad_token_id}")
+
     return model, tokenizer
 
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
@@ -257,7 +258,7 @@ def generate_response(prompt):
     # Add user message to history
     chat_history.append({"role": "user", "content": prompt})
 
-    # Prepare input for the model
+    # Build full prompt from chat history
     full_prompt = ""
     for message in chat_history:
         if message["role"] == "system":
@@ -269,39 +270,62 @@ def generate_response(prompt):
 
     full_prompt += "Assistant: "
 
-    # Instead of using the tokenizer to create inputs with padding,
-    # let's prepare the inputs differently:
-    input_ids = llm_tokenizer.encode(full_prompt, return_tensors='pt')
+    # Use encode_plus which offers more control
+    encoded_input = llm_tokenizer.encode_plus(
+        full_prompt,
+        return_tensors="pt",
+        padding=False,  # Don't pad here - we'll handle it manually
+        add_special_tokens=True,
+        return_attention_mask=True
+    )
+
+    # Extract and move tensors to device
+    input_ids = encoded_input["input_ids"].to(device)
 
-    # Create attention mask manually (all 1's)
-    attention_mask = torch.ones_like(input_ids)
+    # Create attention mask explicitly - all 1s for a non-padded sequence
+    attention_mask = torch.ones_like(input_ids).to(device)
 
-    # Move to device
-    input_ids = input_ids.to(device)
-    attention_mask = attention_mask.to(device)
+    # Print for debugging
+    print(f"Input shape: {input_ids.shape}, Attention mask shape: {attention_mask.shape}")
 
-    # Generate response with completely explicit parameters
+    # Generate with very explicit parameters for OPT models
     with torch.no_grad():
-        output = llm_model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=128,
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9,
-            pad_token_id=llm_tokenizer.pad_token_id,
-            eos_token_id=llm_tokenizer.eos_token_id,
-            use_cache=True,
-            no_repeat_ngram_size=3
-        )
+        try:
+            output = llm_model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,  # Explicitly pass attention mask
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                pad_token_id=llm_tokenizer.pad_token_id,  # Explicitly set pad token ID
+                eos_token_id=llm_tokenizer.eos_token_id,  # Explicitly set EOS token ID
+                use_cache=True,
+                no_repeat_ngram_size=3,
+                # Add these parameters specifically for OPT
+                forced_bos_token_id=None,
+                forced_eos_token_id=None,
+                num_beams=1  # Simple greedy decoding with temperature
+            )
+
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            # Fallback with simpler parameters
+            output = llm_model.generate(
+                input_ids=input_ids,
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7
+            )
 
+    # Decode only the generated part (not the input)
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
     response_text = response_text.split("Assistant: ")[-1].strip()
 
-    # Add assistant response to history
+    # Add assistant response to history
     chat_history.append({"role": "assistant", "content": response_text})
 
-    # Keep history at a reasonable size
+    # Keep history manageable
     if len(chat_history) > 10:
         # Keep system message and last 9 exchanges
         chat_history.pop(1)
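
For reference, a minimal standalone sketch of the two fixes this commit makes to app.py: giving the tokenizer a pad token distinct from EOS (and keeping the embedding matrix and model config in sync), and passing an explicit attention mask plus pad/eos token IDs to generate(). This is an illustration assuming the same facebook/opt-1.3b checkpoint and a plain torch/transformers environment; dtype and error handling are simplified relative to the committed code.

# Minimal sketch (not part of app.py): the pad-token fix and the explicit generate() call in isolation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "facebook/opt-1.3b"  # same checkpoint as app.py

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)

# Fix 1: ensure the pad token exists and differs from EOS, then resize the
# embeddings for the enlarged vocabulary and record the pad ID in the config.
if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

model.to(device)

# Fix 2: generate with an explicit attention mask and explicit pad/eos IDs,
# which avoids the missing-attention-mask / pad-token warning from generate().
prompt = "User: Hello!\nAssistant: "
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=32,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(output[0], skip_special_tokens=True))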