import spaces  # must come first

import gradio as gr
import numpy as np
import torch

from generator import load_csm_1b, Segment

# Model output sample rate (Hz) — CSM-1B emits 24 kHz mono float audio.
SAMPLE_RATE = 24000
# How many previous segments are fed back as conversational context.
CONTEXT_WINDOW = 3

generator = load_csm_1b(device="cuda")


def make_silence(duration_ms=300):
    """Return a silent float32 clip of *duration_ms* at SAMPLE_RATE."""
    num_samples = int((SAMPLE_RATE * duration_ms) / 1000)
    return np.zeros(num_samples, dtype=np.float32)


def fade(audio_np, fade_duration_ms=200):
    """Apply a linear fade-in and fade-out to *audio_np* in place.

    Returns the same array for convenience.

    BUGFIX: the original implementation crashed with a NumPy broadcast
    error in two reachable cases:
      * fade_duration_ms == 0 (the UI slider's minimum) — ``audio_np[-0:]``
        selects the WHOLE array, which was then multiplied by an empty ramp;
      * clips shorter than the fade window — the slice was shorter than
        the ramp it was multiplied by.
    The fade length is now clamped so the two ramps never overlap, and a
    zero-length fade is a no-op.
    """
    fade_len = int((fade_duration_ms / 1000) * SAMPLE_RATE)
    # Clamp so fade-in and fade-out stay disjoint even on very short clips.
    fade_len = min(fade_len, len(audio_np) // 2)
    if fade_len <= 0:
        return audio_np
    audio_np[:fade_len] *= np.linspace(0, 1, fade_len)
    audio_np[-fade_len:] *= np.linspace(1, 0, fade_len)
    return audio_np


@spaces.GPU(duration=30)
def infer(input_text, temp, top_k, top_p, fade_ms, pause_ms):
    """Synthesize each non-empty input line as one segment and stitch them.

    A line consisting of ``__PAUSE__`` inserts *pause_ms* of silence.
    The last CONTEXT_WINDOW generated segments are passed back to the
    model as conversational context.

    Returns a ``(sample_rate, int16_ndarray)`` tuple for ``gr.Audio``.
    Raises ``gr.Error`` when the input contains no usable lines.
    """
    lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
    all_audio = []
    context = []
    for line in lines:
        if line == "__PAUSE__":
            all_audio.append(make_silence(pause_ms))
            continue
        trimmed_context = context[-CONTEXT_WINDOW:]
        audio = generator.generate(
            text=line,
            speaker=0,
            context=trimmed_context,
            max_audio_length_ms=8000,
            temperature=temp,
            topk=top_k,
            top_p=top_p,
        )
        audio_np = audio.cpu().numpy()
        audio_np = fade(audio_np, fade_duration_ms=fade_ms)
        all_audio.append(audio_np)
        context.append(Segment(text=line, speaker=0, audio=audio))
    if not all_audio:
        raise gr.Error("Please enter some text.")
    full_audio = np.concatenate(all_audio)
    # BUGFIX: clip before scaling — a sample at exactly 1.0 previously
    # became 1.0 * 32768 = 32768, which wraps to -32768 in int16 (a click).
    audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    return SAMPLE_RATE, audio_int16


# Multi-line default: __PAUSE__ must sit on its own line for infer()'s
# line-by-line matching to treat it as a pause marker.
DEFAULT_TEXT = """I hunger...
__PAUSE__
For love!"""

with gr.Blocks() as app:
    gr.Markdown("""
    # 🐸 Tahm Kench Voice Synth

    ## Usage tips
    - You may need to try the generation more than once to get a good result. Like most generative AI, there is an element of randomness, controlled by the sampling parameters.
    - ⚠️ **Apostrophe warning:** The model may get confused by apostrophes (`'`). Avoid them if possible.
    - The in-browser play feature does not work on Safari, but you can download and play the clips locally.
    - Enter one statement per line and the conversational segments will be stitched together.
    - The model is prone to make more mistakes and hallucinate with longer generations. If this is an issue, try shorter segments.
    - Use `__PAUSE__` on a line to insert a silent pause.
    - We apply aggressive fade-in and fade-out to each conversational segment to suppress audio artifacts at the edges. If your output is getting unnaturally clipped or quiet at the start/end, try adjusting the fade duration below.
    """)
    with gr.Row():
        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
    with gr.Row():
        temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
        top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
    with gr.Row():
        fade_ms = gr.Slider(0, 1000, value=200, step=10, label="Fade Duration (ms)")
        pause_ms = gr.Slider(0, 1000, value=300, step=10, label="Pause Duration (ms)")
    output_audio = gr.Audio(label="Generated Audio", type="numpy")
    run_button = gr.Button("🎤 Synthesize")
    run_button.click(
        infer,
        inputs=[input_text, temp, top_k, top_p, fade_ms, pause_ms],
        outputs=[output_audio],
    )

app.launch()