# tahm_kench/app.py — commit 3c1c5fa ("add usage note") by xlr8
import spaces # must come first
import gradio as gr
import numpy as np
import torch
from generator import load_csm_1b, Segment
# Output sample rate (Hz) of the CSM model; also used to size silence/fades.
SAMPLE_RATE = 24000
# Max number of previous (text, audio) segments fed back as generation context.
CONTEXT_WINDOW = 3
# Load the CSM-1B speech model onto the GPU once at import time
# (must happen after `import spaces` — see note at the top of the imports).
generator = load_csm_1b(device="cuda")
def make_silence(duration_ms=300, sample_rate=None):
    """Return `duration_ms` of silence as a float32 sample array.

    Args:
        duration_ms: Length of the silence in milliseconds.
        sample_rate: Samples per second; defaults to the module-level
            SAMPLE_RATE when None (kept as a parameter so the helper is
            reusable/testable without the module constant).

    Returns:
        1-D numpy array of zeros, dtype float32.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    num_samples = int((sample_rate * duration_ms) / 1000)
    return np.zeros(num_samples, dtype=np.float32)
def fade(audio_np, fade_duration_ms=200, sample_rate=None):
    """Apply a linear fade-in and fade-out to `audio_np` in place.

    Args:
        audio_np: 1-D float numpy array of samples (modified in place).
        fade_duration_ms: Length of each ramp in milliseconds.
        sample_rate: Samples per second; defaults to module SAMPLE_RATE.

    Returns:
        The same array, for convenient chaining.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    fade_len = int((fade_duration_ms / 1000) * sample_rate)
    # Clamp so the two ramps never overlap on short clips; previously a
    # ramp longer than the clip raised a broadcast ValueError.
    fade_len = min(fade_len, len(audio_np) // 2)
    # Guard the zero-length case: with fade_len == 0 the old code did
    # `audio_np[-0:] *= fade_out`, i.e. multiplied the WHOLE array by an
    # empty ramp — a ValueError whenever the fade slider was set to 0.
    if fade_len <= 0:
        return audio_np
    ramp = np.linspace(0.0, 1.0, fade_len)
    audio_np[:fade_len] *= ramp
    audio_np[-fade_len:] *= ramp[::-1]
    return audio_np
@spaces.GPU(duration=30)
def infer(input_text, temp, top_k, top_p, fade_ms, pause_ms):
    """Synthesize one audio clip from multi-line input text.

    Each non-empty line becomes its own generated segment (speaker 0),
    conditioned on up to CONTEXT_WINDOW previous segments. A line equal to
    "__PAUSE__" inserts `pause_ms` of silence instead of speech. Segments
    are faded at the edges and concatenated.

    Args:
        input_text: Multi-line script; blank lines are skipped.
        temp: Sampling temperature passed to the generator.
        top_k: Top-K sampling parameter.
        top_p: Top-P (nucleus) sampling parameter.
        fade_ms: Per-segment fade-in/out duration in milliseconds.
        pause_ms: Silence duration for "__PAUSE__" lines, in milliseconds.

    Returns:
        (SAMPLE_RATE, int16 numpy array) — the tuple format expected by a
        Gradio Audio output with type="numpy".

    Raises:
        gr.Error: if the input contains no usable lines.
    """
    lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
    all_audio = []
    context = []  # running list of Segment objects used as generation context
    for line in lines:
        if line == "__PAUSE__":
            all_audio.append(make_silence(pause_ms))
            continue
        trimmed_context = context[-CONTEXT_WINDOW:]
        audio = generator.generate(
            text=line,
            speaker=0,
            context=trimmed_context,
            max_audio_length_ms=8000,
            temperature=temp,
            topk=top_k,
            top_p=top_p,
        )
        audio_np = audio.cpu().numpy()
        audio_np = fade(audio_np, fade_duration_ms=fade_ms)
        all_audio.append(audio_np)
        context.append(Segment(text=line, speaker=0, audio=audio))
    if not all_audio:
        raise gr.Error("Please enter some text.")
    full_audio = np.concatenate(all_audio)
    # Clip to [-1, 1] and scale by 32767 (not 32768): a sample at exactly
    # +1.0 would otherwise produce 32768, which wraps around to -32768 in
    # int16 — an audible click at full-scale peaks.
    audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    return SAMPLE_RATE, audio_int16
# Default example shown in the input box; demonstrates the __PAUSE__ marker.
DEFAULT_TEXT = """I hunger...
__PAUSE__
For love!"""

# --- Gradio UI: layout is defined by the nesting of these context managers.
with gr.Blocks() as app:
    gr.Markdown("""
# 🐸 Tahm Kench Voice Synth
## Usage tips
- You may need to try the generation more than once to get a good result. Like most generative AI, there is an element of randomness, controlled by the sampling parameters.
- ⚠️ **Apostrophe warning:** The model may get confused by apostrophes (`'`). Avoid them if possible.
- The in-browser play feature does not work on Safari, but you can download and play the clips locally.
- Enter one statement per line and the conversational segments will be stitched together.
- The model is prone to make more mistakes and hallucinate with longer generations. If this is an issue, try shorter segments.
- Use `__PAUSE__` on a line to insert a silent pause.
- We apply aggressive fade-in and fade-out to each conversational segment to suppress audio artifacts at the edges. If your output is getting unnaturally clipped or quiet at the start/end, try adjusting the fade duration below.
""")
    # Multi-line script input: one statement per line (see usage tips above).
    with gr.Row():
        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
    # Sampling controls forwarded to generator.generate() in infer().
    with gr.Row():
        temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
        top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
    # Post-processing controls: per-segment edge fade and __PAUSE__ length.
    with gr.Row():
        fade_ms = gr.Slider(0, 1000, value=200, step=10, label="Fade Duration (ms)")
        pause_ms = gr.Slider(0, 1000, value=300, step=10, label="Pause Duration (ms)")
    # type="numpy" matches infer()'s (sample_rate, int16 array) return value.
    output_audio = gr.Audio(label="Generated Audio", type="numpy")
    run_button = gr.Button("🎤 Synthesize")
    run_button.click(
        infer,
        inputs=[input_text, temp, top_k, top_p, fade_ms, pause_ms],
        outputs=[output_audio],
    )
app.launch()