|
import gradio as gr |
|
import requests |
|
import time |
|
import subprocess |
|
import threading |
|
import wave |
|
import os |
|
import uuid |
|
import tempfile |
|
import socket |
|
|
|
|
|
|
|
def start_tts_server():
    """Spawn the ``tts-http-server`` executable as a child process.

    The command is launched without arguments. The call does not wait for
    the process to finish, and the ``Popen`` handle is discarded.
    """
    server_command = ["tts-http-server"]
    subprocess.Popen(server_command)
|
|
|
|
|
|
|
# Launch the TTS server from a background thread at import time. The thread
# is a daemon, so it never blocks interpreter shutdown.
threading.Thread(
    daemon=True,
    target=start_tts_server,
).start()
|
|
|
|
|
|
|
def wait_for_server(timeout=30, port=8080, host="localhost"):
    """Poll a TCP port until it accepts connections or *timeout* elapses.

    A connection is attempted roughly once per second. Each attempt uses a
    fresh socket with a one-second connect timeout.

    Args:
        timeout: Maximum number of seconds to keep trying.
        port: TCP port to probe. Defaults to 8080.
        host: Host name or IPv4 address to probe. Defaults to "localhost".

    Returns:
        True as soon as a connection succeeds, False if the deadline passes
        without a successful connection.
    """
    # A monotonic clock keeps the deadline correct even if the system clock
    # is adjusted while we are waiting.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            # The context manager closes the socket on every path, including
            # when connect_ex raises (e.g. host resolution failure).
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
                probe.settimeout(1)
                if probe.connect_ex((host, port)) == 0:
                    return True
        except OSError:
            # Socket-level failures just mean "not ready yet"; retry until
            # the deadline. KeyboardInterrupt/SystemExit still propagate.
            pass

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(1, remaining))

    return False
|
|
|
|
|
|
|
def text_to_speech(text):
    """Synthesize *text* via the local TTS server and save it as a WAV file.

    The text is posted as JSON to ``http://localhost:8080/v1/audio/speech``
    with model ``tts-1`` and voice ``alloy``. On an HTTP 200 response the
    body is written as the frame data of a mono, 16-bit, 24000 Hz WAV file
    with a unique name in the system temporary directory.

    Args:
        text: The text to convert to speech.

    Returns:
        The path of the generated WAV file on success. On any failure
        (blank input, non-200 response, request or file error) a string
        beginning with ``"Error: "`` that describes the problem.
    """
    # NOTE(review): the interface in this file wires this function to
    # gr.Audio(type="filepath"), so the error strings returned below are
    # handed to the audio component in place of a path. Raising gr.Error
    # may give a clearer message in the UI - confirm before changing the
    # return contract.

    # Reject blank input up front instead of asking the server to speak nothing.
    if text is None or (isinstance(text, str) and not text.strip()):
        return "Error: no text provided"

    try:
        response = requests.post(
            "http://localhost:8080/v1/audio/speech",
            headers={"Content-Type": "application/json"},
            json={
                "model": "tts-1",
                "input": text,
                "voice": "alloy",
            },
            # (connect, read) timeouts in seconds: without them a stalled
            # server would block this call indefinitely.
            timeout=(5, 300),
        )

        if response.status_code != 200:
            return f"Error: {response.status_code} - {response.text}"

        # A random hex suffix keeps concurrent requests from overwriting
        # each other's output file.
        unique_id = uuid.uuid4().hex
        audio_file = os.path.join(
            tempfile.gettempdir(), f"tts_output_{unique_id}.wav"
        )

        # The response body is written verbatim as audio frames.
        # NOTE(review): this assumes the server returns headerless 16-bit
        # mono PCM at 24 kHz - confirm against the server's output format.
        with wave.open(audio_file, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(24000)
            wav_file.writeframes(response.content)

        return audio_file
    except Exception as e:
        # Boundary handler: report every failure to the caller as text
        # rather than raising.
        return f"Error: {e}"
|
|
|
|
|
|
|
# Sample texts passed to the interface as `examples`; each one is wrapped in
# its own single-element row when the interface is built.
_example_prompts = [
    "Hello, welcome to this text-to-speech demo. How does my voice sound today?",
    "Text-to-speech technology has evolved rapidly! It now handles inflections, pauses, and emotions. Can you believe how natural it sounds?",
    "Have you ever wondered what it would be like to hear your writing read back to you with perfect pronunciation?",
    "This neural network uses a 12-layer transformer architecture with 768-dimensional embeddings and 12 attention heads.",
    "The temperature today is 72.5 degrees Fahrenheit, with a 30% chance of rain and winds at 15 mph.",
    "I'm absolutely thrilled to announce that our team has finally achieved the breakthrough we've been working toward!",
    '"Do you think we should knock first?" she asked.',
    "The xylophonist performed a captivating rendition of Tchaikovsky's Symphony No. 6 in B minor.",
    "In 1969, Apollo 11 landed on the moon. The spacecraft carried 3 astronauts and traveled approximately 240,000 miles from Earth.",
]

# Multi-line text box that feeds `text_to_speech`.
_text_input = gr.Textbox(
    lines=10,
    label="Text",
    placeholder="Enter text to convert to speech...",
)

# Gradio interface: one text input, one audio output given as a file path.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=gr.Audio(type="filepath"),
    title="2cent TTS",
    description="Convert text to speech using an OpenAI TTS compatible server.",
    examples=[[prompt] for prompt in _example_prompts],
    cache_examples=False,
)
|
|
|
|
|
# Wait for the TTS server before the UI is started, and report the outcome.
# The script continues either way; a timeout only produces a warning.
print("Waiting for TTS server to start...")
if not wait_for_server():
    print(
        "Warning: TTS server did not start in the expected time. The interface may not work correctly."
    )
else:
    print("TTS server is ready. Starting Gradio interface.")
|
|
|
|
|
if __name__ == "__main__":
    # Serve the interface on all network interfaces (0.0.0.0), port 7860.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "max_threads": 40,
    }
    demo.launch(**launch_options)
|
|