# (Hugging Face Spaces page header removed — not part of the program.)
import spaces  # must come first

import gradio as gr
import numpy as np
import torch

from generator import load_csm_1b, Segment
# Audio sample rate (Hz) produced by the CSM model; also used for all
# silence/fade length math below.
SAMPLE_RATE = 24000
# How many previous (text, audio) segments are fed back as conversational context.
CONTEXT_WINDOW = 3

# Load the model once at startup; requires a CUDA device.
generator = load_csm_1b(device="cuda")
def make_silence(duration_ms=300):
    """Return a silent mono float32 buffer lasting ``duration_ms`` milliseconds.

    Length is ``SAMPLE_RATE * duration_ms / 1000`` samples, truncated to int.
    """
    num_samples = int((SAMPLE_RATE * duration_ms) / 1000)
    return np.zeros(num_samples, dtype=np.float32)
def fade(audio_np, fade_duration_ms=200):
    """Apply a linear fade-in and fade-out to ``audio_np`` in place and return it.

    Suppresses clicks/artifacts at segment edges. The ramp length is clamped
    to half the clip so the two ramps never overlap, and a non-positive
    duration is a no-op. (The original crashed when ``fade_duration_ms == 0``:
    ``audio_np[-0:]`` selects the whole array, which cannot be multiplied by
    an empty ``linspace``.)
    """
    fade_len = int((fade_duration_ms / 1000) * SAMPLE_RATE)
    fade_len = min(fade_len, len(audio_np) // 2)
    if fade_len <= 0:
        return audio_np
    audio_np[:fade_len] *= np.linspace(0, 1, fade_len)
    audio_np[-fade_len:] *= np.linspace(1, 0, fade_len)
    return audio_np
def infer(input_text, temp, top_k, top_p, fade_ms, pause_ms):
    """Synthesize one clip per non-empty input line and stitch them together.

    Each line is generated with the last ``CONTEXT_WINDOW`` segments as
    conversational context; a line equal to ``__PAUSE__`` inserts silence.

    Returns:
        (sample_rate, int16 numpy array) tuple, as expected by ``gr.Audio``.
    Raises:
        gr.Error: if the input contains no usable lines.
    """
    lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
    all_audio = []
    context = []  # rolling list of Segment objects for conditioning
    for line in lines:
        if line == "__PAUSE__":
            all_audio.append(make_silence(pause_ms))
            continue
        audio = generator.generate(
            text=line,
            speaker=0,
            context=context[-CONTEXT_WINDOW:],  # only the most recent segments
            max_audio_length_ms=8000,
            temperature=temp,
            topk=top_k,
            top_p=top_p,
        )
        audio_np = fade(audio.cpu().numpy(), fade_duration_ms=fade_ms)
        all_audio.append(audio_np)
        context.append(Segment(text=line, speaker=0, audio=audio))
    if not all_audio:
        raise gr.Error("Please enter some text.")
    full_audio = np.concatenate(all_audio)
    # Clip before converting: a sample at exactly 1.0 would overflow int16
    # with the original "* 32768" scaling.
    audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    return SAMPLE_RATE, audio_int16
# Demo script pre-filled in the text box; "__PAUSE__" inserts a silent gap.
DEFAULT_TEXT = """I hunger...
__PAUSE__
For love!"""
# Gradio UI: text input, sampling controls, and the synthesize button.
with gr.Blocks() as app:
    gr.Markdown("""
# 🐸 Tahm Kench Voice Synth
## Usage tips
- You may need to try the generation more than once to get a good result. Like most generative AI, there is an element of randomness, controlled by the sampling parameters.
- ⚠️ **Apostrophe warning:** The model may get confused by apostrophes (`'`). Avoid them if possible.
- The in-browser play feature does not work on Safari, but you can download and play the clips locally.
- Enter one statement per line and the conversational segments will be stitched together.
- The model is prone to make more mistakes and hallucinate with longer generations. If this is an issue, try shorter segments.
- Use `__PAUSE__` on a line to insert a silent pause.
- We apply aggressive fade-in and fade-out to each conversational segment to suppress audio artifacts at the edges. If your output is getting unnaturally clipped or quiet at the start/end, try adjusting the fade duration below.
""")
    with gr.Row():
        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
    with gr.Row():
        # Sampling parameters passed straight through to generator.generate().
        temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
        top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
    with gr.Row():
        # Post-processing controls: edge fades and __PAUSE__ gap length.
        fade_ms = gr.Slider(0, 1000, value=200, step=10, label="Fade Duration (ms)")
        pause_ms = gr.Slider(0, 1000, value=300, step=10, label="Pause Duration (ms)")
    output_audio = gr.Audio(label="Generated Audio", type="numpy")
    run_button = gr.Button("🎤 Synthesize")
    run_button.click(
        infer,
        inputs=[input_text, temp, top_k, top_p, fade_ms, pause_ms],
        outputs=[output_audio],
    )
app.launch()