husseinelsaadi committed on
Commit
269d410
·
verified ·
1 Parent(s): b1f8948

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -3
app.py CHANGED
@@ -1481,21 +1481,25 @@ bark_voice_preset = "v2/en_speaker_5"
1481
 
1482
  def bark_tts(text):
1483
  print(f"πŸ” Synthesizing TTS for: {text}")
1484
- processed = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
 
 
1485
  input_ids = processed["input_ids"].to(model_bark.device)
1486
  attention_mask = processed.get("attention_mask", None)
1487
  if attention_mask is not None:
1488
  attention_mask = attention_mask.to(model_bark.device)
1489
 
1490
  start = time.time()
 
1491
  speech_values = model_bark.generate(
1492
  input_ids=input_ids,
1493
  attention_mask=attention_mask,
1494
- pad_token_id=10000, # Optional safety
1495
- max_new_tokens=100 # ✅ Passed once only here
1496
  )
1497
  print(f"βœ… Bark finished in {round(time.time() - start, 2)}s")
1498
 
 
1499
  speech = speech_values.cpu().numpy().squeeze()
1500
  speech = (speech * 32767).astype(np.int16)
1501
  temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
@@ -1504,6 +1508,7 @@ def bark_tts(text):
1504
 
1505
 
1506
 
 
1507
  # Whisper STT
1508
  print("πŸ” Loading Whisper model...")
1509
  whisper_model = whisper.load_model("base", device="cuda")
 
1481
 
1482
  def bark_tts(text):
1483
  print(f"πŸ” Synthesizing TTS for: {text}")
1484
+ # DON'T pass voice_preset here — it will inject generation kwargs internally!
1485
+ processed = processor_bark(text, return_tensors="pt")
1486
+
1487
  input_ids = processed["input_ids"].to(model_bark.device)
1488
  attention_mask = processed.get("attention_mask", None)
1489
  if attention_mask is not None:
1490
  attention_mask = attention_mask.to(model_bark.device)
1491
 
1492
  start = time.time()
1493
+ # Pass ONLY these manually — clean, controlled
1494
  speech_values = model_bark.generate(
1495
  input_ids=input_ids,
1496
  attention_mask=attention_mask,
1497
+ pad_token_id=10000,
1498
+ max_new_tokens=100,
1499
  )
1500
  print(f"βœ… Bark finished in {round(time.time() - start, 2)}s")
1501
 
1502
+ # Post-processing
1503
  speech = speech_values.cpu().numpy().squeeze()
1504
  speech = (speech * 32767).astype(np.int16)
1505
  temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
 
1508
 
1509
 
1510
 
1511
+
1512
  # Whisper STT
1513
  print("πŸ” Loading Whisper model...")
1514
  whisper_model = whisper.load_model("base", device="cuda")