Michael Hu committed on
Commit
9740afc
·
1 Parent(s): 77b7581

update tts

Browse files
Files changed (3) hide show
  1. app.py +2 -2
  2. pyproject.toml +1 -1
  3. utils/tts.py +83 -67
app.py CHANGED
@@ -21,7 +21,7 @@ import time
21
  import subprocess
22
  from utils.stt import transcribe_audio
23
  from utils.translation import translate_text
24
- from utils.tts_dummy import generate_speech
25
 
26
  # Hugging Face Spaces Setup Automation
27
  def setup_huggingface_space():
@@ -112,7 +112,7 @@ def handle_file_processing(upload_path):
112
  logger.info("Beginning TTS generation")
113
  status_text.markdown("🎵 **Generating Chinese Speech...**")
114
  with st.spinner("Initializing TTS engine..."):
115
- output_path = generate_speech(chinese_text, language="zh")
116
  progress_bar.progress(100)
117
  logger.info(f"TTS completed. Output file: {output_path}")
118
 
 
21
  import subprocess
22
  from utils.stt import transcribe_audio
23
  from utils.translation import translate_text
24
+ from utils.tts import generate_speech
25
 
26
  # Hugging Face Spaces Setup Automation
27
  def setup_huggingface_space():
 
112
  logger.info("Beginning TTS generation")
113
  status_text.markdown("🎵 **Generating Chinese Speech...**")
114
  with st.spinner("Initializing TTS engine..."):
115
+ output_path = generate_speech(chinese_text, voice="zf_xiaobei")
116
  progress_bar.progress(100)
117
  logger.info(f"TTS completed. Output file: {output_path}")
118
 
pyproject.toml CHANGED
@@ -14,12 +14,12 @@ python = "^3.9"
14
 
15
  # Core application dependencies
16
  streamlit = ">=1.31,<2.0"
17
- pydub = ">=0.25"
18
  python-dotenv = ">=1.0"
19
  nltk = ">=3.8"
20
  librosa = ">=0.10"
21
  soundfile = ">=0.12"
22
  ffmpeg-python = ">=0.2"
 
23
 
24
  # Machine learning dependencies
25
  #torch = [
 
14
 
15
  # Core application dependencies
16
  streamlit = ">=1.31,<2.0"
 
17
  python-dotenv = ">=1.0"
18
  nltk = ">=3.8"
19
  librosa = ">=0.10"
20
  soundfile = ">=0.12"
21
  ffmpeg-python = ">=0.2"
22
+ kokoro = ">=0.7.9"
23
 
24
  # Machine learning dependencies
25
  #torch = [
utils/tts.py CHANGED
@@ -1,78 +1,50 @@
1
  import os
2
- import torch
3
- import time
4
  import logging
5
- from pydub import AudioSegment
6
- from phonemizer.backend.espeak.wrapper import EspeakWrapper
7
- from models import build_model
8
 
9
  logger = logging.getLogger(__name__)
10
 
11
- # Hugging Face Spaces setup
12
- MODEL_DIR = "./kokoro"
13
- os.makedirs(MODEL_DIR, exist_ok=True)
14
-
15
- # Configure espeak-ng for Hugging Face environment
16
- EspeakWrapper.set_library('/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1')
17
-
18
  class TTSEngine:
19
- def __init__(self):
 
 
 
 
 
 
20
  logger.info("Initializing TTS Engine")
21
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
22
- logger.info(f"Using device: {self.device}")
23
- self._verify_model_files()
24
- logger.info("Loading Kokoro model")
25
- self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
26
- logger.info("Loading voice model")
27
- self.voice = torch.load(f"{MODEL_DIR}/voices/af_bella.pt",
28
- map_location=self.device)
29
  logger.info("TTS engine initialized")
30
 
31
- def _verify_model_files(self):
32
- """Ensure required model files exist"""
33
- required_files = [
34
- f"{MODEL_DIR}/kokoro-v0_19.pth",
35
- f"{MODEL_DIR}/voices/af_bella.pt"
36
- ]
37
 
38
- missing = [f for f in required_files if not os.path.exists(f)]
39
- if missing:
40
- logger.error(f"Missing model files: {missing}")
41
- raise FileNotFoundError(
42
- f"Missing model files: {missing}\n"
43
- "Add this to your Hugging Face Space settings:\n"
44
- "App setup -> Clone Kokoro repository: "
45
- "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
46
- )
47
-
48
- def generate_speech(self, text: str, language: str = "zh") -> str:
49
- """Generate speech from Chinese text"""
50
  logger.info(f"Generating speech for text length: {len(text)}")
51
 
52
  try:
53
- from kokoro import generate_full
54
-
55
- if len(text) > 500:
56
- logger.warning(f"Truncating long text ({len(text)} characters)")
57
- text = text[:495] + "[TRUNCATED]"
58
-
59
- logger.info("Starting audio generation")
60
- audio, _ = generate_full(
61
- self.model,
62
- text,
63
- self.voice,
64
- lang='en-us',
65
- max_len=200 if self.device == "cpu" else 500
66
- )
67
 
 
68
  output_path = f"temp/outputs/output_{int(time.time())}.wav"
69
- logger.info(f"Saving audio to {output_path}")
70
- AudioSegment(
71
- audio.numpy().tobytes(),
72
- frame_rate=24000,
73
- sample_width=2,
74
- channels=1
75
- ).export(output_path, format="wav")
 
76
 
77
  logger.info(f"Audio generation complete: {output_path}")
78
  return output_path
@@ -81,11 +53,55 @@ class TTSEngine:
81
  logger.error(f"TTS generation failed: {str(e)}", exc_info=True)
82
  raise
83
 
84
- # Initialize TTS engine once
85
- @st.cache_resource
86
- def get_tts_engine():
87
- return TTSEngine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- def generate_speech(text: str, language: str = "zh") -> str:
90
- """Public interface for TTS generation"""
91
- return get_tts_engine().generate_speech(text, language)
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
2
  import logging
3
+ import time
4
+ import soundfile as sf
5
+ from kokoro import KPipeline
6
 
7
  logger = logging.getLogger(__name__)
8
 
 
 
 
 
 
 
 
9
class TTSEngine:
    """Text-to-speech engine that wraps a single Kokoro ``KPipeline``.

    The pipeline is created once in ``__init__`` for one language code and
    reused by every synthesis call on the instance.
    """

    # Sample rate used for every file written and every segment yielded.
    SAMPLE_RATE = 24000
    # Directory that receives generated WAV files (created on demand).
    OUTPUT_DIR = "temp/outputs"

    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                'j' for Japanese, 'z' for Mandarin Chinese)
        """
        logger.info("Initializing TTS Engine")
        self.pipeline = KPipeline(lang_code=lang_code)
        logger.info("TTS engine initialized")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Synthesize ``text`` into one WAV file and return its path.

        Every segment produced by the pipeline is appended to the same file,
        so text that the pipeline splits into several segments is kept whole
        instead of being cut off after the first segment.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (intended range 0.5 to 2.0;
                not validated here)

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: If the pipeline yields no audio for ``text``.
            Exception: Any pipeline or file error is logged and re-raised.
        """
        logger.info(f"Generating speech for text length: {len(text)}")

        try:
            # Create output directory if it doesn't exist
            os.makedirs(self.OUTPUT_DIR, exist_ok=True)

            # Timestamp-based output path (one-second resolution)
            output_path = f"{self.OUTPUT_DIR}/output_{int(time.time())}.wav"

            logger.info(f"Saving audio to {output_path}")
            segment_count = 0
            # NOTE(review): assumes each segment is 1-D mono audio - confirm
            # against the Kokoro pipeline output.
            with sf.SoundFile(
                output_path,
                mode="w",
                samplerate=self.SAMPLE_RATE,
                channels=1,
            ) as wav_file:
                for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                    wav_file.write(audio)
                    segment_count += 1

            if segment_count == 0:
                # Do not hand back a path to an empty, unplayable file.
                os.remove(output_path)
                raise RuntimeError("TTS pipeline produced no audio for the given text")

            logger.info(f"Audio generation complete: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"TTS generation failed: {str(e)}", exc_info=True)
            raise

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0):
        """Generate speech from text and yield each segment

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
            speed (float): Speech speed multiplier (intended range 0.5 to 2.0;
                not validated here)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        try:
            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                yield self.SAMPLE_RATE, audio

        except Exception as e:
            logger.error(f"TTS streaming failed: {str(e)}", exc_info=True)
            raise
75
+
76
+ # Initialize TTS engine with cache decorator if using Streamlit
77
def get_tts_engine(lang_code='a'):
    """Get or create TTS engine instance

    When Streamlit is importable the engine is built through
    ``st.cache_resource``. The language code is passed to the cached function
    as an argument so that it is part of the cache key and each language gets
    its own engine. Without Streamlit a new engine is built on every call.

    Args:
        lang_code (str): Language code for the pipeline

    Returns:
        TTSEngine: Initialized TTS engine instance
    """
    # Keep the try body to the import alone, so an ImportError raised while
    # constructing the engine is not mistaken for "streamlit is missing".
    try:
        import streamlit as st
    except ImportError:
        return TTSEngine(lang_code)

    @st.cache_resource
    def _get_engine(code):
        return TTSEngine(code)

    return _get_engine(lang_code)
94
 
95
def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Public interface for TTS generation

    The engine language is taken from the voice ID, so that a Mandarin voice
    such as 'zf_xiaobei' is not run through the default 'a' pipeline.

    Args:
        text (str): Input text to synthesize
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier

    Returns:
        str: Path to generated audio file
    """
    # Kokoro voice IDs look like '<lang><f|m>_<name>' ('af_heart', 'zf_xiaobei'),
    # where the first letter is the pipeline language code. Anything that does
    # not match that shape falls back to the default engine.
    looks_like_voice_id = (
        isinstance(voice, str)
        and len(voice) > 3
        and voice[0].isalpha()
        and voice[1] in "fm"
        and voice[2] == "_"
    )
    engine = get_tts_engine(voice[0]) if looks_like_voice_id else get_tts_engine()
    return engine.generate_speech(text, voice, speed)