Spaces:
Sleeping
Sleeping
File size: 3,527 Bytes
bdb7ac2 1cb8653 bdb7ac2 0f1e2d2 c9a53fa bdb7ac2 c9a53fa bdb7ac2 d75aa56 bdb7ac2 c9a53fa bdb7ac2 1cb8653 bdb7ac2 9e3c2e9 1cb8653 14d90b7 3c1c5fa d75aa56 e9f0abe 14d90b7 c9a53fa 14d90b7 9e3c2e9 bdb7ac2 1cb8653 bdb7ac2 9e3c2e9 d75aa56 c9a53fa bdb7ac2 c9a53fa bdb7ac2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import spaces # must come first
import gradio as gr
import numpy as np
import torch
from generator import load_csm_1b, Segment
# Output sample rate of the CSM-1b model (Hz).
SAMPLE_RATE = 24000
# Number of previous (text, audio) segments fed back as conversational context.
CONTEXT_WINDOW = 3
# Load the model once at import time; requires a CUDA device (ZeroGPU Space).
generator = load_csm_1b(device="cuda")
def make_silence(duration_ms=300):
    """Return a float32 buffer of silence lasting `duration_ms` milliseconds."""
    sample_count = int(SAMPLE_RATE * duration_ms / 1000)
    return np.zeros(sample_count, dtype=np.float32)
def fade(audio_np, fade_duration_ms=200):
    """Apply a linear fade-in and fade-out to `audio_np` in place.

    Args:
        audio_np: 1-D float audio buffer (mutated in place and also returned).
        fade_duration_ms: fade length in milliseconds at each end.

    Returns:
        The same array, with both edges ramped to suppress boundary artifacts.
    """
    fade_len = int((fade_duration_ms / 1000) * SAMPLE_RATE)
    # Guard: with fade_len == 0, `audio_np[-0:]` would slice the WHOLE array
    # and multiplying by an empty ramp raises a broadcast error.
    if fade_len <= 0:
        return audio_np
    # Clamp so a fade longer than the clip cannot broadcast-error either.
    fade_len = min(fade_len, len(audio_np))
    audio_np[:fade_len] *= np.linspace(0.0, 1.0, fade_len)
    audio_np[-fade_len:] *= np.linspace(1.0, 0.0, fade_len)
    return audio_np
@spaces.GPU(duration=30)
def infer(input_text, temp, top_k, top_p, fade_ms, pause_ms):
    """Synthesize one audio clip per non-empty input line and stitch them.

    Args:
        input_text: multi-line text; `__PAUSE__` on its own line inserts silence.
        temp, top_k, top_p: sampling parameters forwarded to the generator.
        fade_ms: per-segment fade-in/out duration in milliseconds.
        pause_ms: duration of a `__PAUSE__` gap in milliseconds.

    Returns:
        (sample_rate, int16 numpy array) tuple as expected by gr.Audio.

    Raises:
        gr.Error: when the input contains no usable lines.
    """
    lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
    all_audio = []
    context = []
    for line in lines:
        if line == "__PAUSE__":
            all_audio.append(make_silence(pause_ms))
            continue
        # Only the most recent segments are passed back as context to keep
        # generation bounded.
        trimmed_context = context[-CONTEXT_WINDOW:]
        audio = generator.generate(
            text=line,
            speaker=0,
            context=trimmed_context,
            max_audio_length_ms=8000,
            temperature=temp,
            topk=top_k,
            top_p=top_p,
        )
        audio_np = audio.cpu().numpy()
        audio_np = fade(audio_np, fade_duration_ms=fade_ms)
        all_audio.append(audio_np)
        context.append(Segment(text=line, speaker=0, audio=audio))
    if not all_audio:
        raise gr.Error("Please enter some text.")
    full_audio = np.concatenate(all_audio)
    # Clip before converting: scaling by 32768 would wrap samples at +1.0
    # around to -32768 (int16 overflow), producing an audible click.
    audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    return SAMPLE_RATE, audio_int16
# Example input shown on load; demonstrates the __PAUSE__ directive.
DEFAULT_TEXT = """I hunger...
__PAUSE__
For love!"""
# --- Gradio UI wiring -------------------------------------------------------
with gr.Blocks() as app:
    gr.Markdown("""
# 🐸 Tahm Kench Voice Synth
## Usage tips
- You may need to try the generation more than once to get a good result. Like most generative AI, there is an element of randomness, controlled by the sampling parameters.
- ⚠️ **Apostrophe warning:** The model may get confused by apostrophes (`'`). Avoid them if possible.
- The in-browser play feature does not work on Safari, but you can download and play the clips locally.
- Enter one statement per line and the conversational segments will be stitched together.
- The model is prone to make more mistakes and hallucinate with longer generations. If this is an issue, try shorter segments.
- Use `__PAUSE__` on a line to insert a silent pause.
- We apply aggressive fade-in and fade-out to each conversational segment to suppress audio artifacts at the edges. If your output is getting unnaturally clipped or quiet at the start/end, try adjusting the fade duration below.
""")
    with gr.Row():
        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
    # Sampling controls forwarded verbatim to infer().
    with gr.Row():
        temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
        top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
    # Post-processing controls (per-segment fade, __PAUSE__ duration).
    with gr.Row():
        fade_ms = gr.Slider(0, 1000, value=200, step=10, label="Fade Duration (ms)")
        pause_ms = gr.Slider(0, 1000, value=300, step=10, label="Pause Duration (ms)")
    # type="numpy" matches infer()'s (sample_rate, int16 array) return shape.
    output_audio = gr.Audio(label="Generated Audio", type="numpy")
    run_button = gr.Button("🎤 Synthesize")
    run_button.click(
        infer,
        inputs=[input_text, temp, top_k, top_p, fade_ms, pause_ms],
        outputs=[output_audio],
    )
app.launch()
|