# 2cent-tts / app.py
# taylorchu's picture
# 8080
# 2a32af5
import gradio as gr
import requests
import time
import subprocess
import threading
import wave
import os
import uuid
import tempfile
import socket
# Start the TTS server in the background
def start_tts_server():
    """Spawn the ``tts-http-server`` executable as a child process.

    The call returns as soon as the process has been created; it does not
    wait for the server to finish or to become reachable, and the process
    handle is not kept.
    """
    command = ["tts-http-server"]
    subprocess.Popen(command)
# Launch the server from a daemon thread so the launcher never keeps the
# interpreter alive on shutdown.
_server_thread = threading.Thread(target=start_tts_server, daemon=True)
_server_thread.start()
# Wait for the server port to be open
def wait_for_server(timeout=30, port=8080, host="localhost", poll_interval=1.0):
    """Block until a TCP connection to ``host:port`` succeeds or time runs out.

    Args:
        timeout: Maximum number of seconds to keep polling.
        port: TCP port to probe (defaults to the port the TTS server uses).
        host: Host name or IPv4 address to probe.
        poll_interval: Seconds to pause between connection attempts.

    Returns:
        True as soon as a connection attempt succeeds, False if the timeout
        elapses first (a non-positive timeout returns False without probing).
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # The context manager closes the socket even if connecting raises.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.settimeout(1)
                if sock.connect_ex((host, port)) == 0:  # Port is open
                    return True
        except OSError:
            # Name-resolution or socket errors are treated as "not up yet";
            # unlike a bare ``except`` this still lets Ctrl-C through.
            pass
        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining > 0:
            time.sleep(min(poll_interval, remaining))
    return False
# Function to convert text to speech
def text_to_speech(text):
    """Synthesize *text* through the local TTS HTTP server and save it as WAV.

    The text is posted to ``/v1/audio/speech`` on ``localhost:8080``; the
    response body is written into a uniquely named WAV file in the system
    temporary directory.

    Args:
        text: The text to convert to speech.

    Returns:
        Path of the newly written ``tts_output_<hex>.wav`` file.

    Raises:
        gr.Error: If the text is empty, the request fails or times out, the
            server answers with a non-200 status, or the file cannot be
            written. The output component expects a file path, so failures
            are raised for Gradio to display instead of being returned as a
            string that would be mistaken for a path.
    """
    if not text or not text.strip():
        raise gr.Error("Error: no text provided")

    try:
        response = requests.post(
            "http://localhost:8080/v1/audio/speech",
            headers={"Content-Type": "application/json"},
            json={
                "model": "tts-1",
                "input": text,
                "voice": "alloy",
            },
            # (connect, read) seconds: without a timeout a stalled server
            # would block this worker indefinitely.
            timeout=(5, 120),
        )
    except requests.RequestException as exc:
        raise gr.Error(f"Error: {str(exc)}") from exc

    if response.status_code != 200:
        raise gr.Error(f"Error: {response.status_code} - {response.text}")

    # Create a unique filename for each request
    unique_id = uuid.uuid4().hex
    audio_file = os.path.join(tempfile.gettempdir(), f"tts_output_{unique_id}.wav")

    # Wrap the response bytes in a WAV container.
    # NOTE(review): assumes the server returns raw 16-bit mono PCM at 24 kHz;
    # confirm against the server's default response format.
    pcm_data = response.content
    try:
        with wave.open(audio_file, "wb") as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit (2 bytes)
            wav_file.setframerate(24000)  # 24kHz
            wav_file.writeframes(pcm_data)
    except (OSError, wave.Error) as exc:
        raise gr.Error(f"Error: {str(exc)}") from exc
    return audio_file
# Create Gradio interface
# Sample prompts shown in the UI; each one fills the single text input.
_EXAMPLE_PROMPTS = [
    "Hello, welcome to this text-to-speech demo. How does my voice sound today?",
    "Text-to-speech technology has evolved rapidly! It now handles inflections, pauses, and emotions. Can you believe how natural it sounds?",
    "Have you ever wondered what it would be like to hear your writing read back to you with perfect pronunciation?",
    "This neural network uses a 12-layer transformer architecture with 768-dimensional embeddings and 12 attention heads.",
    "The temperature today is 72.5 degrees Fahrenheit, with a 30% chance of rain and winds at 15 mph.",
    "I'm absolutely thrilled to announce that our team has finally achieved the breakthrough we've been working toward!",
    '"Do you think we should knock first?" she asked.',
    "The xylophonist performed a captivating rendition of Tchaikovsky's Symphony No. 6 in B minor.",
    "In 1969, Apollo 11 landed on the moon. The spacecraft carried 3 astronauts and traveled approximately 240,000 miles from Earth.",
]

# Single-input, single-output UI: a multi-line text box feeding
# text_to_speech, whose returned file path is played by the audio component.
# NOTE(review): nothing here removes the generated WAV files afterwards.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(
        placeholder="Enter text to convert to speech...",
        label="Text",
        lines=10,
    ),
    outputs=gr.Audio(type="filepath"),
    title="2cent TTS",
    description="Convert text to speech using an OpenAI TTS compatible server.",
    # Interface expects one row (list) per example.
    examples=[[prompt] for prompt in _EXAMPLE_PROMPTS],
    cache_examples=False,  # Don't cache, generate new audio each time
)
# Wait for server to be ready before launching the interface
print("Waiting for TTS server to start...")
# Report the outcome of the readiness poll; the app continues either way.
print(
    "TTS server is ready. Starting Gradio interface."
    if wait_for_server()
    else "Warning: TTS server did not start in the expected time. The interface may not work correctly."
)

# Serve the UI only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on all interfaces, port 7860, with max_threads set to 40.
    demo.launch(server_name="0.0.0.0", server_port=7860, max_threads=40)