# tahm_kench/app.py — commit 3c1c5fa ("add usage note") by xlr8
import spaces # must come first
import gradio as gr
import numpy as np
import torch
from generator import load_csm_1b, Segment
# Output sample rate (Hz) of the CSM model; also used to size silence/fades.
SAMPLE_RATE = 24000
# Max number of previous (text, audio) segments fed back as generation context.
CONTEXT_WINDOW = 3
# Load the CSM-1B speech model onto the GPU once at import time
# (must happen after `import spaces` — see note at the top of the imports).
generator = load_csm_1b(device="cuda")
def make_silence(duration_ms=300, sample_rate=None):
    """Return `duration_ms` of silence as a float32 sample array.

    Args:
        duration_ms: Length of the silence in milliseconds.
        sample_rate: Samples per second; defaults to the module-level
            SAMPLE_RATE when None (kept as a parameter so the helper is
            reusable/testable without the module constant).

    Returns:
        1-D numpy array of zeros, dtype float32.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    num_samples = int((sample_rate * duration_ms) / 1000)
    return np.zeros(num_samples, dtype=np.float32)
def fade(audio_np, fade_duration_ms=200, sample_rate=None):
    """Apply a linear fade-in and fade-out to `audio_np` in place.

    Args:
        audio_np: 1-D float numpy array of samples (modified in place).
        fade_duration_ms: Length of each ramp in milliseconds.
        sample_rate: Samples per second; defaults to module SAMPLE_RATE.

    Returns:
        The same array, for convenient chaining.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    fade_len = int((fade_duration_ms / 1000) * sample_rate)
    # Clamp so the two ramps never overlap on short clips; previously a
    # ramp longer than the clip raised a broadcast ValueError.
    fade_len = min(fade_len, len(audio_np) // 2)
    # Guard the zero-length case: with fade_len == 0 the old code did
    # `audio_np[-0:] *= fade_out`, i.e. multiplied the WHOLE array by an
    # empty ramp — a ValueError whenever the fade slider was set to 0.
    if fade_len <= 0:
        return audio_np
    ramp = np.linspace(0.0, 1.0, fade_len)
    audio_np[:fade_len] *= ramp
    audio_np[-fade_len:] *= ramp[::-1]
    return audio_np
@spaces.GPU(duration=30)
def infer(input_text, temp, top_k, top_p, fade_ms, pause_ms):
    """Synthesize one audio clip from multi-line input text.

    Each non-empty line becomes its own generated segment (speaker 0),
    conditioned on up to CONTEXT_WINDOW previous segments. A line equal to
    "__PAUSE__" inserts `pause_ms` of silence instead of speech. Segments
    are faded at the edges and concatenated.

    Args:
        input_text: Multi-line script; blank lines are skipped.
        temp: Sampling temperature passed to the generator.
        top_k: Top-K sampling parameter.
        top_p: Top-P (nucleus) sampling parameter.
        fade_ms: Per-segment fade-in/out duration in milliseconds.
        pause_ms: Silence duration for "__PAUSE__" lines, in milliseconds.

    Returns:
        (SAMPLE_RATE, int16 numpy array) — the tuple format expected by a
        Gradio Audio output with type="numpy".

    Raises:
        gr.Error: if the input contains no usable lines.
    """
    lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
    all_audio = []
    context = []  # running list of Segment objects used as generation context
    for line in lines:
        if line == "__PAUSE__":
            all_audio.append(make_silence(pause_ms))
            continue
        trimmed_context = context[-CONTEXT_WINDOW:]
        audio = generator.generate(
            text=line,
            speaker=0,
            context=trimmed_context,
            max_audio_length_ms=8000,
            temperature=temp,
            topk=top_k,
            top_p=top_p,
        )
        audio_np = audio.cpu().numpy()
        audio_np = fade(audio_np, fade_duration_ms=fade_ms)
        all_audio.append(audio_np)
        context.append(Segment(text=line, speaker=0, audio=audio))
    if not all_audio:
        raise gr.Error("Please enter some text.")
    full_audio = np.concatenate(all_audio)
    # Clip to [-1, 1] and scale by 32767 (not 32768): a sample at exactly
    # +1.0 would otherwise produce 32768, which wraps around to -32768 in
    # int16 — an audible click at full-scale peaks.
    audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    return SAMPLE_RATE, audio_int16
# Default example shown in the input box; demonstrates the __PAUSE__ marker.
DEFAULT_TEXT = """I hunger...
__PAUSE__
For love!"""

# --- Gradio UI: layout is defined by the nesting of these context managers.
with gr.Blocks() as app:
    gr.Markdown("""
# 🐸 Tahm Kench Voice Synth
## Usage tips
- You may need to try the generation more than once to get a good result. Like most generative AI, there is an element of randomness, controlled by the sampling parameters.
- ⚠️ **Apostrophe warning:** The model may get confused by apostrophes (`'`). Avoid them if possible.
- The in-browser play feature does not work on Safari, but you can download and play the clips locally.
- Enter one statement per line and the conversational segments will be stitched together.
- The model is prone to make more mistakes and hallucinate with longer generations. If this is an issue, try shorter segments.
- Use `__PAUSE__` on a line to insert a silent pause.
- We apply aggressive fade-in and fade-out to each conversational segment to suppress audio artifacts at the edges. If your output is getting unnaturally clipped or quiet at the start/end, try adjusting the fade duration below.
""")
    # Multi-line script input: one statement per line (see usage tips above).
    with gr.Row():
        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
    # Sampling controls forwarded to generator.generate() in infer().
    with gr.Row():
        temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
        top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
    # Post-processing controls: per-segment edge fade and __PAUSE__ length.
    with gr.Row():
        fade_ms = gr.Slider(0, 1000, value=200, step=10, label="Fade Duration (ms)")
        pause_ms = gr.Slider(0, 1000, value=300, step=10, label="Pause Duration (ms)")
    # type="numpy" matches infer()'s (sample_rate, int16 array) return value.
    output_audio = gr.Audio(label="Generated Audio", type="numpy")
    run_button = gr.Button("🎤 Synthesize")
    run_button.click(
        infer,
        inputs=[input_text, temp, top_k, top_p, fade_ms, pause_ms],
        outputs=[output_audio],
    )
app.launch()