# kokoro-mcp / kokoro_text_to_audio.py
# Author: fdaudens (HF Staff)
# Commit a132885 (verified): "Upload 3 files" - 2.56 kB
import gradio as gr
import torch
from transformers import AutoModelForTextToWaveform, AutoProcessor
# Resolve the target device first so the weight precision can be matched to it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and processor.
model_name = "hexgrad/Kokoro-82M"
processor = AutoProcessor.from_pretrained(model_name)
# Half precision is only requested on GPU; on CPU the weights are loaded as
# float32, because float16 inference on CPU is poorly supported and slow.
model = AutoModelForTextToWaveform.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model = model.to(device)
def text_to_audio(text, speed=1.0):
    """Convert text to audio using the Kokoro model.

    Args:
        text: The text to synthesize. Must contain at least one
            non-whitespace character.
        speed: Time-stretch factor applied after generation. ``1.0`` returns
            the generated waveform unchanged; any other value is passed to
            ``librosa.effects.time_stretch`` as ``rate``.

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``sample_rate`` is ``24000``
        and ``waveform`` is a float32 NumPy array, regardless of ``speed``.

    Raises:
        ValueError: If ``text`` is empty or whitespace-only, or if ``speed``
            is not a positive number.
    """
    # Validate up front so bad requests fail fast, before any model work.
    if not text or not text.strip():
        raise ValueError("text must not be empty")
    if speed is None or speed <= 0:
        raise ValueError(f"speed must be a positive number, got {speed!r}")

    # Process the input text and move every tensor to the model's device.
    inputs = processor(text=text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Set generation parameters.
    gen_kwargs = {
        "do_sample": True,
        "temperature": 0.7,
        "length_penalty": 1.0,
        "repetition_penalty": 2.0,
        "top_p": 0.9,
    }

    # Generate the waveform without tracking gradients.
    with torch.no_grad():
        generated = model.generate(**inputs, **gen_kwargs)

    # Upcast to float32 before leaving torch so the returned dtype is the same
    # on both speed paths; [0] keeps only the first item of the output.
    waveform = generated.float().cpu().numpy()[0]

    # Hard-coded sample rate; it is not read from the model or processor.
    # NOTE(review): assumes the model emits 24 kHz audio - confirm.
    sample_rate = 24000

    # Apply the speed factor only when it would actually change the audio.
    if speed != 1.0:
        # Imported lazily so librosa is only needed for non-default speeds.
        import librosa

        waveform = librosa.effects.time_stretch(waveform, rate=speed)

    return sample_rate, waveform
# Build the Gradio interface: a text box and speed slider on the left, the
# generated audio on the right, with static usage tips underneath.
with gr.Blocks(title="Kokoro Text-to-Audio") as app:
    # Page title and one-line description.
    for _heading in (
        "# 🎵 Kokoro Text-to-Audio Converter",
        "Convert text to speech using hexgrad/Kokoro-82M model",
    ):
        gr.Markdown(_heading)

    with gr.Row():
        # Input column.
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type something to convert to audio...",
                lines=5,
            )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=1.5,
                value=1.0,
                step=0.1,
                label="Speech Speed",
            )
            submit_btn = gr.Button("Generate Audio")

        # Output column.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Clicking the button runs text_to_audio on the text and speed inputs and
    # sends its result to the audio component.
    submit_btn.click(
        fn=text_to_audio,
        inputs=[text_input, speed_slider],
        outputs=[audio_output],
    )

    # Static help text, one Markdown component per line.
    for _tip in (
        "### Usage Tips",
        "- For best results, keep your text reasonably short",
        "- Adjust the speed slider to modify the pace of speech",
        "- The model may take a moment to load on first use",
    ):
        gr.Markdown(_tip)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    app.launch()