husseinelsaadi committed
Commit b986477 · 1 Parent(s): ae625e2

added edge tts

Files changed (2):
  1. app.py (+14 -19)
  2. requirements.txt (+2 -1)
app.py CHANGED
@@ -1695,7 +1695,7 @@ import numpy as np
 import scipy.io.wavfile as wavfile
 import os
 import json
-from TTS.api import TTS
+import edge_tts
 import torch, gc
 from faster_whisper import WhisperModel
 import asyncio
@@ -1707,9 +1707,9 @@ torch.cuda.empty_cache()
 gc.collect()
 
 # Global variables for lazy loading
-tts_model = None
 faster_whisper_model = None
-bark_voice_preset = "v2/en_speaker_6"
+tts_voice = "en-US-AriaNeural"
+
 
 # Thread pool for async operations
 executor = ThreadPoolExecutor(max_workers=2)
@@ -1725,17 +1725,11 @@ else:
 
 def load_models_lazy():
     """Load models only when needed"""
-    global tts_model, faster_whisper_model
+    global faster_whisper_model
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"🔍 Using device: {device}")
 
-    if tts_model is None:
-        print("🔍 Loading Coqui TTS model...")
-        tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False,
-                        gpu=torch.cuda.is_available())
-        print("✅ Coqui TTS model loaded")
-
     if faster_whisper_model is None:
         print("🔍 Loading Faster-Whisper model...")
         compute_type = "float16" if device == "cuda" else "int8"
@@ -1743,15 +1737,16 @@ def load_models_lazy():
     print(f"✅ Faster-Whisper model loaded on {device}")
 
 
+async def edge_tts_to_file(text, output_path="tts.wav", voice=tts_voice):
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_path)
+    return output_path
+
 def tts_async(text):
-    def _generate():
-        load_models_lazy()
-        print(f"🎙️ Speaking: {text}")
-        temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        tts_model.tts_to_file(text=text, file_path=temp_wav.name)
-        return temp_wav.name
-
-    return executor.submit(_generate)
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    return executor.submit(loop.run_until_complete, edge_tts_to_file(text))
+
 
 
 
@@ -1901,8 +1896,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         cleaned_text = first_q.strip().replace("\n", " ")
         audio_future = tts_async(cleaned_text)
         audio_path = audio_future.result()
-        print("⏱️ Bark TTS took", round(time.perf_counter() - start, 2), "seconds")
-
+        print("⏱️ TTS (edge-tts) took", round(time.perf_counter() - start, 2), "seconds")
+
         # Log question
         state["log"].append({
             "type": "question",
requirements.txt CHANGED
@@ -39,4 +39,5 @@ huggingface_hub==0.20.3
 textract==1.6.3
 bitsandbytes
 faster-whisper==0.10.0
-TTS==0.22.0
+edge-tts==6.1.2
+
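
On the dependency side, the commit swaps the heavyweight Coqui TTS==0.22.0 package for the small edge-tts==6.1.2 client, which calls Microsoft's hosted voices instead of running a local model. If a voice other than "en-US-AriaNeural" is wanted, the library can enumerate what the service offers; the sketch below assumes edge_tts.list_voices() is available in the pinned release and that each entry exposes ShortName and Locale fields.

# Hypothetical smoke test after `pip install edge-tts==6.1.2`:
# print the en-US voices so tts_voice in app.py can be changed if desired.
import asyncio

import edge_tts


async def main():
    voices = await edge_tts.list_voices()  # metadata for every available voice
    for voice in voices:
        if voice["Locale"] == "en-US":
            print(voice["ShortName"])


asyncio.run(main())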