Commit
·
b986477
1
Parent(s):
ae625e2
added edge tts
Browse files
- app.py +14 -19
- requirements.txt +2 -1
app.py
CHANGED
@@ -1695,7 +1695,7 @@ import numpy as np
|
|
1695 |
import scipy.io.wavfile as wavfile
|
1696 |
import os
|
1697 |
import json
|
1698 |
-
|
1699 |
import torch, gc
|
1700 |
from faster_whisper import WhisperModel
|
1701 |
import asyncio
|
@@ -1707,9 +1707,9 @@ torch.cuda.empty_cache()
|
|
1707 |
gc.collect()
|
1708 |
|
1709 |
# Global variables for lazy loading
|
1710 |
-
tts_model = None
|
1711 |
faster_whisper_model = None
|
1712 |
-
|
|
|
1713 |
|
1714 |
# Thread pool for async operations
|
1715 |
executor = ThreadPoolExecutor(max_workers=2)
|
@@ -1725,17 +1725,11 @@ else:
|
|
1725 |
|
1726 |
def load_models_lazy():
|
1727 |
"""Load models only when needed"""
|
1728 |
-
global
|
1729 |
|
1730 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
1731 |
print(f"π Using device: {device}")
|
1732 |
|
1733 |
-
if tts_model is None:
|
1734 |
-
print("π Loading Coqui TTS model...")
|
1735 |
-
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False,
|
1736 |
-
gpu=torch.cuda.is_available())
|
1737 |
-
print("✅ Coqui TTS model loaded")
|
1738 |
-
|
1739 |
if faster_whisper_model is None:
|
1740 |
print("π Loading Faster-Whisper model...")
|
1741 |
compute_type = "float16" if device == "cuda" else "int8"
|
@@ -1743,15 +1737,16 @@ def load_models_lazy():
|
|
1743 |
print(f"✅ Faster-Whisper model loaded on {device}")
|
1744 |
|
1745 |
|
|
|
|
|
|
|
|
|
|
|
1746 |
def tts_async(text):
|
1747 |
-
|
1748 |
-
|
1749 |
-
|
1750 |
-
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
1751 |
-
tts_model.tts_to_file(text=text, file_path=temp_wav.name)
|
1752 |
-
return temp_wav.name
|
1753 |
|
1754 |
-
return executor.submit(_generate)
|
1755 |
|
1756 |
|
1757 |
|
@@ -1901,8 +1896,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
1901 |
cleaned_text = first_q.strip().replace("\n", " ")
|
1902 |
audio_future = tts_async(cleaned_text)
|
1903 |
audio_path = audio_future.result()
|
1904 |
-
print("⏱️
|
1905 |
-
|
1906 |
# Log question
|
1907 |
state["log"].append({
|
1908 |
"type": "question",
|
|
|
1695 |
import scipy.io.wavfile as wavfile
|
1696 |
import os
|
1697 |
import json
|
1698 |
+
import edge_tts
|
1699 |
import torch, gc
|
1700 |
from faster_whisper import WhisperModel
|
1701 |
import asyncio
|
|
|
1707 |
gc.collect()
|
1708 |
|
1709 |
# Global variables for lazy loading
|
|
|
1710 |
faster_whisper_model = None
|
1711 |
+
tts_voice = "en-US-AriaNeural"
|
1712 |
+
|
1713 |
|
1714 |
# Thread pool for async operations
|
1715 |
executor = ThreadPoolExecutor(max_workers=2)
|
|
|
1725 |
|
1726 |
def load_models_lazy():
|
1727 |
"""Load models only when needed"""
|
1728 |
+
global faster_whisper_model
|
1729 |
|
1730 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
1731 |
print(f"π Using device: {device}")
|
1732 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1733 |
if faster_whisper_model is None:
|
1734 |
print("π Loading Faster-Whisper model...")
|
1735 |
compute_type = "float16" if device == "cuda" else "int8"
|
|
|
1737 |
print(f"✅ Faster-Whisper model loaded on {device}")
|
1738 |
|
1739 |
|
1740 |
+
async def edge_tts_to_file(text, output_path="tts.wav", voice=tts_voice):
|
1741 |
+
communicate = edge_tts.Communicate(text, voice)
|
1742 |
+
await communicate.save(output_path)
|
1743 |
+
return output_path
|
1744 |
+
|
1745 |
def tts_async(text):
|
1746 |
+
loop = asyncio.new_event_loop()
|
1747 |
+
asyncio.set_event_loop(loop)
|
1748 |
+
return executor.submit(loop.run_until_complete, edge_tts_to_file(text))
|
|
|
|
|
|
|
1749 |
|
|
|
1750 |
|
1751 |
|
1752 |
|
|
|
1896 |
cleaned_text = first_q.strip().replace("\n", " ")
|
1897 |
audio_future = tts_async(cleaned_text)
|
1898 |
audio_path = audio_future.result()
|
1899 |
+
print("⏱️ TTS (edge-tts) took", round(time.perf_counter() - start, 2), "seconds")
|
1900 |
+
|
1901 |
# Log question
|
1902 |
state["log"].append({
|
1903 |
"type": "question",
|
requirements.txt
CHANGED
@@ -39,4 +39,5 @@ huggingface_hub==0.20.3
|
|
39 |
textract==1.6.3
|
40 |
bitsandbytes
|
41 |
faster-whisper==0.10.0
|
42 |
-
|
|
|
|
39 |
textract==1.6.3
|
40 |
bitsandbytes
|
41 |
faster-whisper==0.10.0
|
42 |
+
edge-tts==6.1.2
|
43 |
+
|