Spaces:

minhhungg
/

melotts-api

Running

App Files Community

minhhungg committed on Jul 16

Commit

146a956

verified ·

1 Parent(s): f7e06e7

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -31

app.py CHANGED Viewed

@@ -1,49 +1,51 @@
-# app.py (for MeloTTS API Space)
 import gradio as gr
-import os
 import torch
 import io
 import soundfile as sf
 import base64
 import logging
-# --- Setup ---
-# This command is crucial and needs to run once.
-# It downloads the dictionary needed for Japanese/Korean.
 os.system('python -m unidic download')
 from melo.api import TTS
-# --- Logging ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# --- Model Loading ---
-MODEL = None
-DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 LANGUAGE = 'KR'
-# We will use 0.9 for a slightly faster than normal, clear pace.
-SPEED = 0.9
 try:
-    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}")
-    MODEL = TTS(language=LANGUAGE, device=DEVICE)
-    SPEAKER_ID = 'KR' # For Korean, the main speaker is just 'KR'
     logger.info("MeloTTS model loaded successfully.")
-    logger.info(f"Default speaker: {SPEAKER_ID}, Default speed: {SPEED}")
 except Exception as e:
     logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
-    MODEL = None
-# --- Main TTS Synthesis Function ---
 def synthesize(text_to_synthesize):
-    if not MODEL:
-        raise gr.Error("TTS Model is not loaded. Cannot process request.")
     if not text_to_synthesize or not text_to_synthesize.strip():
-        # Create and return a silent audio data URI
-        silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16) # Melo uses 24kHz
         wav_buffer = io.BytesIO()
         sf.write(wav_buffer, silent_audio, 24000, format='WAV')
         wav_buffer.seek(0)
@@ -53,34 +55,43 @@ def synthesize(text_to_synthesize):
     try:
         logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
-        # Use an in-memory BytesIO object to store the audio
         wav_buffer = io.BytesIO()
-        MODEL.tts_to_file(text_to_synthesize, SPEAKER_ID, wav_buffer, speed=SPEED, format='wav')
         # Reset buffer position to the beginning
         wav_buffer.seek(0)
-        # Read the bytes and encode to base64
         wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
         logger.info("Synthesis complete.")
         return f"data:audio/wav;base64,{wav_base64}"
     except Exception as e:
-        logger.exception(f"MeloTTS synthesis error: {e}")
         raise gr.Error(f"An error occurred during synthesis: {str(e)}")
 # --- Create and Launch the Gradio Interface ---
-# We create a pure API without a complex UI.
 iface = gr.Interface(
     fn=synthesize,
     inputs=gr.Textbox(label="Text to Synthesize"),
-    outputs=gr.Textbox(label="Base64 Audio Output"), # Output is a text string for the API
-    title="MeloTTS API for Korean",
-    description="A simplified API for MeloTTS, configured for Korean language.",
     api_name="synthesize"
 )
-# The .queue() is important for handling multiple requests on HF Spaces.
 iface.queue().launch()

+# app.py (for your new MeloTTS space)
 import gradio as gr
 import torch
 import io
+import os
+import numpy as np
 import soundfile as sf
 import base64
 import logging
+# Download the UniDic dictionary (needed for Japanese/Korean) at startup, before importing melo.
 os.system('python -m unidic download')
 from melo.api import TTS
+# --- Setup Logging ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# --- Configuration ---
+# We pre-configure everything here.
 LANGUAGE = 'KR'
+# NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point. Adjust if needed.
+SPEED = 0.8
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+SPEAKER_ID = 'KR' # Default Korean speaker
+# --- Load Model (this happens only once when the space starts) ---
+MODEL_INSTANCE = None
 try:
+    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}...")
+    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
     logger.info("MeloTTS model loaded successfully.")
 except Exception as e:
     logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
+    MODEL_INSTANCE = None
 def synthesize(text_to_synthesize):
+    """
+    Takes text input and returns a base64 encoded WAV audio data URI string.
+    """
+    if not MODEL_INSTANCE:
+        raise gr.Error("TTS Model is not available. Cannot process request.")
     if not text_to_synthesize or not text_to_synthesize.strip():
+        # Create and return a silent audio data URI for empty input
+        silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
         wav_buffer = io.BytesIO()
         sf.write(wav_buffer, silent_audio, 24000, format='WAV')
         wav_buffer.seek(0)
     try:
         logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
+        # Use an in-memory BytesIO object to hold the audio data
         wav_buffer = io.BytesIO()
+        # Synthesize audio directly to the buffer
+        MODEL_INSTANCE.tts_to_file(
+            text_to_synthesize,
+            MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
+            wav_buffer,
+            speed=SPEED,
+            format='wav'
+        )
         # Reset buffer position to the beginning
         wav_buffer.seek(0)
+        # Encode the bytes to base64
         wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
         logger.info("Synthesis complete.")
+        # Return the data URI string our React app expects
         return f"data:audio/wav;base64,{wav_base64}"
     except Exception as e:
+        logger.exception(f"TTS synthesis error: {e}")
         raise gr.Error(f"An error occurred during synthesis: {str(e)}")
 # --- Create and Launch the Gradio Interface ---
+# We create a pure API with no complex UI. This is fast and reliable.
 iface = gr.Interface(
     fn=synthesize,
     inputs=gr.Textbox(label="Text to Synthesize"),
+    outputs="text", # The API will return a simple text string (our base64 URI)
+    title="MeloTTS API",
+    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
     api_name="synthesize"
 )
+# The .queue() helps manage traffic and is recommended for public APIs.
 iface.queue().launch()