minhhungg committed on
Commit
146a956
·
verified ·
1 Parent(s): f7e06e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -31
app.py CHANGED
@@ -1,49 +1,51 @@
1
- # app.py (for MeloTTS API Space)
2
 
3
  import gradio as gr
4
- import os
5
  import torch
6
  import io
 
 
7
  import soundfile as sf
8
  import base64
9
  import logging
10
 
11
- # --- Setup ---
12
- # This command is crucial and needs to run once.
13
- # It downloads the dictionary needed for Japanese/Korean.
14
  os.system('python -m unidic download')
15
 
16
  from melo.api import TTS
17
 
18
- # --- Logging ---
19
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
20
  logger = logging.getLogger(__name__)
21
 
22
- # --- Model Loading ---
23
- MODEL = None
24
- DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
25
  LANGUAGE = 'KR'
26
- # We will use 0.9 for a slightly faster than normal, clear pace.
27
- SPEED = 0.9
 
 
28
 
 
 
29
  try:
30
- logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}")
31
- MODEL = TTS(language=LANGUAGE, device=DEVICE)
32
- SPEAKER_ID = 'KR' # For Korean, the main speaker is just 'KR'
33
  logger.info("MeloTTS model loaded successfully.")
34
- logger.info(f"Default speaker: {SPEAKER_ID}, Default speed: {SPEED}")
35
  except Exception as e:
36
  logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
37
- MODEL = None
38
 
39
- # --- Main TTS Synthesis Function ---
40
  def synthesize(text_to_synthesize):
41
- if not MODEL:
42
- raise gr.Error("TTS Model is not loaded. Cannot process request.")
 
 
 
43
 
44
  if not text_to_synthesize or not text_to_synthesize.strip():
45
- # Create and return a silent audio data URI
46
- silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16) # Melo uses 24kHz
47
  wav_buffer = io.BytesIO()
48
  sf.write(wav_buffer, silent_audio, 24000, format='WAV')
49
  wav_buffer.seek(0)
@@ -53,34 +55,43 @@ def synthesize(text_to_synthesize):
53
  try:
54
  logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
55
 
56
- # Use an in-memory BytesIO object to store the audio
57
  wav_buffer = io.BytesIO()
58
- MODEL.tts_to_file(text_to_synthesize, SPEAKER_ID, wav_buffer, speed=SPEED, format='wav')
 
 
 
 
 
 
 
 
59
 
60
  # Reset buffer position to the beginning
61
  wav_buffer.seek(0)
62
 
63
- # Read the bytes and encode to base64
64
  wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
65
 
66
  logger.info("Synthesis complete.")
 
 
67
  return f"data:audio/wav;base64,{wav_base64}"
68
 
69
  except Exception as e:
70
- logger.exception(f"MeloTTS synthesis error: {e}")
71
  raise gr.Error(f"An error occurred during synthesis: {str(e)}")
72
 
73
-
74
  # --- Create and Launch the Gradio Interface ---
75
- # We create a pure API without a complex UI.
76
  iface = gr.Interface(
77
  fn=synthesize,
78
  inputs=gr.Textbox(label="Text to Synthesize"),
79
- outputs=gr.Textbox(label="Base64 Audio Output"), # Output is a text string for the API
80
- title="MeloTTS API for Korean",
81
- description="A simplified API for MeloTTS, configured for Korean language.",
82
  api_name="synthesize"
83
  )
84
 
85
- # The .queue() is important for handling multiple requests on HF Spaces.
86
  iface.queue().launch()
 
1
+ # app.py (for your new MeloTTS space)
2
 
3
  import gradio as gr
 
4
  import torch
5
  import io
6
+ import os
7
+ import numpy as np
8
  import soundfile as sf
9
  import base64
10
  import logging
11
 
12
+ # Download the UniDic dictionary once at startup, before melo.api is imported below
 
 
13
  os.system('python -m unidic download')
14
 
15
  from melo.api import TTS
16
 
17
+ # --- Setup Logging ---
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
  logger = logging.getLogger(__name__)
20
 
21
+ # --- Configuration ---
22
+ # We pre-configure everything here.
 
23
  LANGUAGE = 'KR'
24
+ # NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point. Adjust if needed.
25
+ SPEED = 0.8
26
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
27
+ SPEAKER_ID = 'KR' # Default Korean speaker
28
 
29
+ # --- Load Model (this happens only once when the space starts) ---
30
+ MODEL_INSTANCE = None
31
  try:
32
+ logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}...")
33
+ MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
 
34
  logger.info("MeloTTS model loaded successfully.")
 
35
  except Exception as e:
36
  logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
37
+ MODEL_INSTANCE = None
38
 
 
39
  def synthesize(text_to_synthesize):
40
+ """
41
+ Takes text input and returns a base64 encoded WAV audio data URI string.
42
+ """
43
+ if not MODEL_INSTANCE:
44
+ raise gr.Error("TTS Model is not available. Cannot process request.")
45
 
46
  if not text_to_synthesize or not text_to_synthesize.strip():
47
+ # Create and return a silent audio data URI for empty input
48
+ silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
49
  wav_buffer = io.BytesIO()
50
  sf.write(wav_buffer, silent_audio, 24000, format='WAV')
51
  wav_buffer.seek(0)
 
55
  try:
56
  logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
57
 
58
+ # Use an in-memory BytesIO object to hold the audio data
59
  wav_buffer = io.BytesIO()
60
+
61
+ # Synthesize audio directly to the buffer
62
+ MODEL_INSTANCE.tts_to_file(
63
+ text_to_synthesize,
64
+ MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
65
+ wav_buffer,
66
+ speed=SPEED,
67
+ format='wav'
68
+ )
69
 
70
  # Reset buffer position to the beginning
71
  wav_buffer.seek(0)
72
 
73
+ # Encode the bytes to base64
74
  wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
75
 
76
  logger.info("Synthesis complete.")
77
+
78
+ # Return the data URI string our React app expects
79
  return f"data:audio/wav;base64,{wav_base64}"
80
 
81
  except Exception as e:
82
+ logger.exception(f"TTS synthesis error: {e}")
83
  raise gr.Error(f"An error occurred during synthesis: {str(e)}")
84
 
 
85
  # --- Create and Launch the Gradio Interface ---
86
+ # We create a pure API with no complex UI. This is fast and reliable.
87
  iface = gr.Interface(
88
  fn=synthesize,
89
  inputs=gr.Textbox(label="Text to Synthesize"),
90
+ outputs="text", # The API will return a simple text string (our base64 URI)
91
+ title="MeloTTS API",
92
+ description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
93
  api_name="synthesize"
94
  )
95
 
96
+ # The .queue() helps manage traffic and is recommended for public APIs.
97
  iface.queue().launch()