File size: 3,527 Bytes
bdb7ac2
 
 
 
 
 
 
 
 
1cb8653
bdb7ac2
 
 
 
 
 
 
 
 
 
 
 
 
0f1e2d2
c9a53fa
bdb7ac2
 
 
 
 
 
c9a53fa
bdb7ac2
 
 
 
 
 
 
 
 
 
 
d75aa56
bdb7ac2
 
 
c9a53fa
bdb7ac2
 
 
 
 
 
 
 
 
 
 
1cb8653
 
 
bdb7ac2
 
9e3c2e9
 
1cb8653
14d90b7
3c1c5fa
d75aa56
e9f0abe
14d90b7
 
c9a53fa
14d90b7
9e3c2e9
bdb7ac2
1cb8653
bdb7ac2
9e3c2e9
 
d75aa56
c9a53fa
 
 
bdb7ac2
 
 
 
 
c9a53fa
bdb7ac2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import spaces  # must come first
import gradio as gr
import numpy as np
import torch
from generator import load_csm_1b, Segment

# Output sample rate of the CSM-1B vocoder, in Hz.
SAMPLE_RATE = 24000
# How many previous (text, audio) segments are fed back as conversational
# context for each new generation.
CONTEXT_WINDOW = 3

# Loaded once at import time; requires a CUDA device (this module is intended
# to run inside a `spaces.GPU`-provisioned environment).
generator = load_csm_1b(device="cuda")

def make_silence(duration_ms=300, sample_rate=None):
    """Return a silent mono float32 buffer lasting *duration_ms* milliseconds.

    Args:
        duration_ms: Length of the silence in milliseconds.
        sample_rate: Samples per second; defaults to the module-level
            SAMPLE_RATE when None (parameterized for reuse/testing).

    Returns:
        A 1-D ``np.float32`` array of zeros.
    """
    sr = SAMPLE_RATE if sample_rate is None else sample_rate
    num_samples = int((sr * duration_ms) / 1000)
    return np.zeros(num_samples, dtype=np.float32)

def fade(audio_np, fade_duration_ms=200, sample_rate=None):
    """Apply a linear fade-in and fade-out to *audio_np* in place.

    Fixes two crashes in the previous version:
    - ``fade_duration_ms=0`` (the UI slider's minimum) made ``fade_len`` 0,
      so ``audio_np[-0:]`` selected the WHOLE array and multiplying it by an
      empty ramp raised a broadcast ValueError.
    - A fade longer than the clip (short generation + long fade) produced
      ramps longer than the array, also a broadcast ValueError.

    Args:
        audio_np: 1-D float audio buffer; modified in place.
        fade_duration_ms: Length of each ramp in milliseconds.
        sample_rate: Samples per second; defaults to the module-level
            SAMPLE_RATE when None.

    Returns:
        The same ``audio_np`` array, for chaining.
    """
    sr = SAMPLE_RATE if sample_rate is None else sample_rate
    fade_len = int((fade_duration_ms / 1000) * sr)
    # Clamp so the ramps never exceed the clip; 0 disables fading entirely.
    fade_len = min(fade_len, len(audio_np))
    if fade_len <= 0:
        return audio_np
    ramp = np.linspace(0, 1, fade_len)
    audio_np[:fade_len] *= ramp          # fade in
    audio_np[-fade_len:] *= ramp[::-1]   # fade out (reversed ramp)
    return audio_np

@spaces.GPU(duration=30)
def infer(input_text, temp, top_k, top_p, fade_ms, pause_ms):
    """Synthesize one audio clip from multi-line *input_text*.

    Each non-blank line is generated as its own segment (speaker 0), with up
    to CONTEXT_WINDOW previous segments fed back as conversational context.
    A line equal to ``__PAUSE__`` inserts silence instead of speech.

    Args:
        input_text: Newline-separated statements, possibly with ``__PAUSE__``
            marker lines.
        temp: Sampling temperature passed to the generator.
        top_k: Top-K sampling cutoff.
        top_p: Top-P (nucleus) sampling cutoff.
        fade_ms: Fade-in/out duration applied to each spoken segment (ms).
        pause_ms: Duration of each ``__PAUSE__`` silence (ms).

    Returns:
        ``(SAMPLE_RATE, audio_int16)`` in the numpy format gr.Audio expects.

    Raises:
        gr.Error: If the input contains no usable lines.
    """
    lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
    all_audio = []
    context = []

    for line in lines:
        if line == "__PAUSE__":
            all_audio.append(make_silence(pause_ms))
            continue

        # Only the most recent segments are kept as context to bound
        # generation cost and memory.
        trimmed_context = context[-CONTEXT_WINDOW:]

        audio = generator.generate(
            text=line,
            speaker=0,
            context=trimmed_context,
            max_audio_length_ms=8000,
            temperature=temp,
            topk=top_k,
            top_p=top_p,
        )

        audio_np = audio.cpu().numpy()
        audio_np = fade(audio_np, fade_duration_ms=fade_ms)
        all_audio.append(audio_np)

        context.append(Segment(text=line, speaker=0, audio=audio))

    if not all_audio:
        raise gr.Error("Please enter some text.")

    full_audio = np.concatenate(all_audio)
    # Clip before converting: the previous `* 32768` wrapped around for
    # samples at or above +1.0 (32768 is outside the int16 range).
    audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    return SAMPLE_RATE, audio_int16

# Pre-filled demo script shown in the input box; includes a __PAUSE__ marker
# to demonstrate the silence feature.
DEFAULT_TEXT = """I hunger...
__PAUSE__
For love!"""

# --- Gradio UI -------------------------------------------------------------
# Layout: usage notes, a multi-line text input, sampling-parameter sliders,
# fade/pause sliders, an audio output widget, and a synthesize button wired
# to `infer`.
with gr.Blocks() as app:
    gr.Markdown("""
# 🐸 Tahm Kench Voice Synth

## Usage tips
- You may need to try the generation more than once to get a good result.  Like most generative AI, there is an element of randomness, controlled by the sampling parameters.
- ⚠️ **Apostrophe warning:** The model may get confused by apostrophes (`'`). Avoid them if possible.
- The in-browser play feature does not work on Safari, but you can download and play the clips locally.
- Enter one statement per line and the conversational segments will be stitched together.
- The model is prone to make more mistakes and hallucinate with longer generations. If this is an issue, try shorter segments.
- Use `__PAUSE__` on a line to insert a silent pause.
- We apply aggressive fade-in and fade-out to each conversational segment to suppress audio artifacts at the edges. If your output is getting unnaturally clipped or quiet at the start/end, try adjusting the fade duration below.
""")
    with gr.Row():
        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
    # Sampling controls; slider order must match the `inputs=` list below.
    with gr.Row():
        temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
        top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
    # Post-processing controls (per-segment fade, __PAUSE__ duration).
    with gr.Row():
        fade_ms = gr.Slider(0, 1000, value=200, step=10, label="Fade Duration (ms)")
        pause_ms = gr.Slider(0, 1000, value=300, step=10, label="Pause Duration (ms)")
    # `type="numpy"` matches infer's (sample_rate, int16 array) return value.
    output_audio = gr.Audio(label="Generated Audio", type="numpy")
    run_button = gr.Button("🎤 Synthesize")

    run_button.click(
        infer,
        inputs=[input_text, temp, top_k, top_p, fade_ms, pause_ms],
        outputs=[output_audio],
    )

app.launch()