hashhac committed · Commit 190ab02 · Parent(s): 5c42f52
testing 3
app.py CHANGED
@@ -53,42 +53,43 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"

-    # Load tokenizer
+    # Load tokenizer with special attention to the padding token
     tokenizer = AutoTokenizer.from_pretrained(model_id)

     # Print initial configuration
     print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")

-    #
+    # For OPT models specifically - configure tokenizer before loading model
+    if tokenizer.pad_token is None:
+        # Use a completely different token as pad token - must be done before model loading
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        # Ensure pad token is really different from EOS token
+        assert tokenizer.pad_token_id != tokenizer.eos_token_id, "Pad token still same as EOS token!"
+        print(f"Added special PAD token with ID {tokenizer.pad_token_id} (different from EOS: {tokenizer.eos_token_id})")
+
+    # Load model with the knowledge that tokenizer may have been modified
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         torch_dtype=torch_dtype,
         low_cpu_mem_usage=True
     )

-    #
-
-    # Define a special token with ID that doesn't conflict
-    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-    model.resize_token_embeddings(len(tokenizer))
-
-    # Make sure model config has consistent pad token ID
-    model.config.pad_token_id = tokenizer.pad_token_id
-
-    # Important: Also set these token IDs in model config
-    if hasattr(model.config, 'decoder_start_token_id') and model.config.decoder_start_token_id is None:
-        model.config.decoder_start_token_id = tokenizer.pad_token_id
-
-    print(f"Modified token IDs - PAD: {tokenizer.pad_token_id}, EOS: {tokenizer.eos_token_id}")
-    print(f"Model config - PAD: {model.config.pad_token_id}, EOS: {model.config.eos_token_id}")
+    # Resize embeddings to match tokenizer
+    model.resize_token_embeddings(len(tokenizer))

-    #
-
-    model.config.pad_token_id = tokenizer.pad_token_id
+    # CRITICAL: Make sure model config knows about the pad token
+    model.config.pad_token_id = tokenizer.pad_token_id

-    #
+    # OPT models need this explicit configuration
+    if hasattr(model.config, "word_embed_proj_dim"):
+        model.config._remove_wrong_keys = False
+
+    # Move model to device
     model.to(device)

+    print(f"Final token setup - Pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+    print(f"Model config pad_token_id: {model.config.pad_token_id}")
+
     return model, tokenizer

 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
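The hunk above guards against the pad token colliding with the EOS token: a dedicated '[PAD]' token is registered before the model is loaded, the embedding matrix is resized to the new tokenizer length, and the pad id is copied into the model config. A minimal standalone sketch of the same idea follows; it is not part of the commit, assumes the transformers and torch packages, and uses facebook/opt-125m as a lighter stand-in for facebook/opt-1.3b.

# Standalone sanity check for the pad-token handling above (a sketch, not part
# of the commit). facebook/opt-125m stands in for facebook/opt-1.3b.
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

if tokenizer.pad_token is None:
    # Register a dedicated pad token before the model is loaded
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# The pad token must not collide with EOS, and the resized embedding matrix
# must have one row per tokenizer id (including any newly added token).
assert tokenizer.pad_token_id != tokenizer.eos_token_id
assert model.get_input_embeddings().weight.shape[0] == len(tokenizer)
print(f"pad={tokenizer.pad_token_id}, eos={tokenizer.eos_token_id}, vocab={len(tokenizer)}")

Resizing the embeddings after adding the token matters: a pad id that lies outside the embedding matrix would fail as soon as it is looked up.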
@@ -257,7 +258,7 @@ def generate_response(prompt):
     # Add user message to history
     chat_history.append({"role": "user", "content": prompt})

-    #
+    # Build full prompt from chat history
     full_prompt = ""
     for message in chat_history:
         if message["role"] == "system":
@@ -269,39 +270,62 @@ def generate_response(prompt):

     full_prompt += "Assistant: "

-    #
-
-
+    # Use encode_plus which offers more control
+    encoded_input = llm_tokenizer.encode_plus(
+        full_prompt,
+        return_tensors="pt",
+        padding=False,  # Don't pad here - we'll handle it manually
+        add_special_tokens=True,
+        return_attention_mask=True
+    )
+
+    # Extract and move tensors to device
+    input_ids = encoded_input["input_ids"].to(device)

-    # Create attention mask
-    attention_mask = torch.ones_like(input_ids)
+    # Create attention mask explicitly - all 1s for a non-padded sequence
+    attention_mask = torch.ones_like(input_ids).to(device)

-    #
-
-    attention_mask = attention_mask.to(device)
+    # Print for debugging
+    print(f"Input shape: {input_ids.shape}, Attention mask shape: {attention_mask.shape}")

-    # Generate
+    # Generate with very explicit parameters for OPT models
     with torch.no_grad():
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            output = llm_model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,  # Explicitly pass attention mask
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                pad_token_id=llm_tokenizer.pad_token_id,  # Explicitly set pad token ID
+                eos_token_id=llm_tokenizer.eos_token_id,  # Explicitly set EOS token ID
+                use_cache=True,
+                no_repeat_ngram_size=3,
+                # Add these parameters specifically for OPT
+                forced_bos_token_id=None,
+                forced_eos_token_id=None,
+                num_beams=1  # Simple greedy decoding with temperature
+            )
+
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            # Fallback with simpler parameters
+            output = llm_model.generate(
+                input_ids=input_ids,
+                max_new_tokens=128,
+                do_sample=True,
+                temperature=0.7
+            )

+    # Decode only the generated part (not the input)
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
     response_text = response_text.split("Assistant: ")[-1].strip()

-    # Add assistant response to history
+    # Add assistant response to history
     chat_history.append({"role": "assistant", "content": response_text})

-    # Keep history
+    # Keep history manageable
     if len(chat_history) > 10:
         # Keep system message and last 9 exchanges
         chat_history.pop(1)
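The new generation code decodes the whole output sequence and then recovers the reply by splitting the decoded string on "Assistant: ". An equivalent approach, sketched below and not part of the commit, is to slice the prompt tokens off before decoding so that only the newly generated tokens are turned into text. The facebook/opt-125m checkpoint and the example prompt are illustrative assumptions; app.py itself uses facebook/opt-1.3b and builds the prompt from chat_history.

# Sketch (not part of the commit): decode only the newly generated tokens by
# slicing off the prompt instead of splitting the decoded string.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "facebook/opt-125m"  # lighter stand-in for facebook/opt-1.3b
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

full_prompt = "You are a helpful voice assistant.\nUser: How are you today?\nAssistant: "
encoded = tokenizer(full_prompt, return_tensors="pt")
input_ids = encoded["input_ids"].to(device)
attention_mask = encoded["attention_mask"].to(device)

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
    )

# output[0] holds the prompt tokens followed by the completion; skip the prompt
# so the reply never has to be recovered with a string split.
new_tokens = output[0][input_ids.shape[1]:]
response_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
print(response_text)

Slicing by input_ids.shape[1] works for decoder-only models such as OPT, where generate() returns the prompt followed by the completion.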