husseinelsaadi committed on
Commit
ae625e2
·
1 Parent(s): 1a5a90b

added bark tts for faster loading

Browse files
Files changed (2) hide show
  1. app.py +14 -33
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1695,7 +1695,7 @@ import numpy as np
1695
  import scipy.io.wavfile as wavfile
1696
  import os
1697
  import json
1698
- from transformers import BarkModel, AutoProcessor
1699
  import torch, gc
1700
  from faster_whisper import WhisperModel
1701
  import asyncio
@@ -1707,8 +1707,7 @@ torch.cuda.empty_cache()
1707
  gc.collect()
1708
 
1709
  # Global variables for lazy loading
1710
- model_bark = None
1711
- processor_bark = None
1712
  faster_whisper_model = None
1713
  bark_voice_preset = "v2/en_speaker_6"
1714
 
@@ -1726,20 +1725,16 @@ else:
1726
 
1727
  def load_models_lazy():
1728
  """Load models only when needed"""
1729
- global model_bark, processor_bark, faster_whisper_model
1730
 
1731
  device = "cuda" if torch.cuda.is_available() else "cpu"
1732
  print(f"πŸ” Using device: {device}")
1733
-
1734
- if model_bark is None:
1735
- print("πŸ” Loading Bark model...")
1736
- model_bark = BarkModel.from_pretrained("suno/bark").to(device)
1737
- print(f"βœ… Bark model loaded on {device}")
1738
-
1739
- if processor_bark is None:
1740
- print("πŸ” Loading Bark processor...")
1741
- processor_bark = AutoProcessor.from_pretrained("suno/bark")
1742
- print("βœ… Bark processor loaded")
1743
 
1744
  if faster_whisper_model is None:
1745
  print("πŸ” Loading Faster-Whisper model...")
@@ -1748,32 +1743,18 @@ def load_models_lazy():
1748
  print(f"βœ… Faster-Whisper model loaded on {device}")
1749
 
1750
 
1751
- def bark_tts_async(text):
1752
- """Fully correct async TTS generation with Bark"""
1753
  def _generate():
1754
  load_models_lazy()
1755
- device = next(model_bark.parameters()).device
1756
- print(f"πŸ” Bark model on: {device}")
1757
  print(f"πŸŽ™οΈ Speaking: {text}")
1758
-
1759
- # 🧠 Prepare full input using processor (not just input_ids)
1760
- inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1761
- inputs = {k: v.to(device) for k, v in inputs.items()}
1762
-
1763
- # ✅ Generate using unpacked args — this includes all required prompt tensors
1764
- with torch.no_grad():
1765
- speech_values = model_bark.generate(**inputs)
1766
-
1767
- # ✅ Convert to audio
1768
- speech = speech_values.cpu().numpy().squeeze()
1769
- speech = (speech * 32767).astype(np.int16)
1770
  temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
1771
- wavfile.write(temp_wav.name, 22050, speech)
1772
  return temp_wav.name
1773
 
1774
  return executor.submit(_generate)
1775
 
1776
 
 
1777
  def whisper_stt(audio_path):
1778
  """STT using Faster-Whisper"""
1779
  if not audio_path or not os.path.exists(audio_path):
@@ -1918,7 +1899,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
1918
  # Generate audio with Bark (wait for it)
1919
  start = time.perf_counter()
1920
  cleaned_text = first_q.strip().replace("\n", " ")
1921
- audio_future = bark_tts_async(cleaned_text)
1922
  audio_path = audio_future.result()
1923
  print("⏱️ Bark TTS took", round(time.perf_counter() - start, 2), "seconds")
1924
 
@@ -2014,7 +1995,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
2014
  state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
2015
 
2016
  # Generate TTS asynchronously for next question too
2017
- audio_future = bark_tts_async(next_q)
2018
  # For now, we'll wait for it (you can make this async too)
2019
  audio_path = audio_future.result()
2020
 
 
1695
  import scipy.io.wavfile as wavfile
1696
  import os
1697
  import json
1698
+ from TTS.api import TTS
1699
  import torch, gc
1700
  from faster_whisper import WhisperModel
1701
  import asyncio
 
1707
  gc.collect()
1708
 
1709
  # Global variables for lazy loading
1710
+ tts_model = None
 
1711
  faster_whisper_model = None
1712
  bark_voice_preset = "v2/en_speaker_6"
1713
 
 
1725
 
1726
  def load_models_lazy():
1727
  """Load models only when needed"""
1728
+ global tts_model, faster_whisper_model
1729
 
1730
  device = "cuda" if torch.cuda.is_available() else "cpu"
1731
  print(f"πŸ” Using device: {device}")
1732
+
1733
+ if tts_model is None:
1734
+ print("πŸ” Loading Coqui TTS model...")
1735
+ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False,
1736
+ gpu=torch.cuda.is_available())
1737
+ print("βœ… Coqui TTS model loaded")
 
 
 
 
1738
 
1739
  if faster_whisper_model is None:
1740
  print("πŸ” Loading Faster-Whisper model...")
 
1743
  print(f"βœ… Faster-Whisper model loaded on {device}")
1744
 
1745
 
1746
+ def tts_async(text):
 
1747
  def _generate():
1748
  load_models_lazy()
 
 
1749
  print(f"πŸŽ™οΈ Speaking: {text}")
 
 
 
 
 
 
 
 
 
 
 
 
1750
  temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
1751
+ tts_model.tts_to_file(text=text, file_path=temp_wav.name)
1752
  return temp_wav.name
1753
 
1754
  return executor.submit(_generate)
1755
 
1756
 
1757
+
1758
  def whisper_stt(audio_path):
1759
  """STT using Faster-Whisper"""
1760
  if not audio_path or not os.path.exists(audio_path):
 
1899
  # Generate audio with Bark (wait for it)
1900
  start = time.perf_counter()
1901
  cleaned_text = first_q.strip().replace("\n", " ")
1902
+ audio_future = tts_async(cleaned_text)
1903
  audio_path = audio_future.result()
1904
  print("⏱️ Bark TTS took", round(time.perf_counter() - start, 2), "seconds")
1905
 
 
1995
  state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
1996
 
1997
  # Generate TTS asynchronously for next question too
1998
+ audio_future = tts_async(next_q)
1999
  # For now, we'll wait for it (you can make this async too)
2000
  audio_path = audio_future.result()
2001
 
requirements.txt CHANGED
@@ -38,4 +38,5 @@ accelerate==0.29.3
38
  huggingface_hub==0.20.3
39
  textract==1.6.3
40
  bitsandbytes
41
- faster-whisper==0.10.0
 
 
38
  huggingface_hub==0.20.3
39
  textract==1.6.3
40
  bitsandbytes
41
+ faster-whisper==0.10.0
42
+ TTS==0.22.0