Spaces:

xlr8harder
/

tahm_kench

Sleeping

App Files Community

xlr8 committed on Apr 25

Commit

1cb8653

1 Parent(s): 1278f47

update default text

Browse files

Files changed (1) hide show

app.py +12 -6

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ CONTEXT_WINDOW = 3
 FADE_MS = 200
 PAUSE_MS = 300
-generator = load_csm_1b(device="cuda") # XXX FIXME
 def make_silence(duration_ms=300):
     num_samples = int((SAMPLE_RATE * duration_ms) / 1000)
@@ -25,7 +25,7 @@ def fade(audio_np, fade_duration_ms=200):
     audio_np[-fade_len:] *= fade_out
     return audio_np
-@spaces.GPU(duration=90)
 def infer(input_text, temp, top_k, top_p):
     lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
     all_audio = []
@@ -45,7 +45,7 @@ def infer(input_text, temp, top_k, top_p):
             max_audio_length_ms=8000,
             temperature=temp,
             topk=top_k,
-            #top_p=top_p,
         )
         audio_np = audio.cpu().numpy()
@@ -61,20 +61,26 @@ def infer(input_text, temp, top_k, top_p):
     audio_int16 = (full_audio * 32768).astype(np.int16)
     return SAMPLE_RATE, audio_int16
 with gr.Blocks() as app:
     gr.Markdown("""
 # 🐸 Tahm Kench Voice Synth
 Enter lines of dialogue for Tahm Kench.
-- Use `__PAUSE__` on a line to insert 300ms silence.
 - Use shorter sentences and conservative sampling parameters to avoid hallucinations and degenerate output.
 """)
     with gr.Row():
-        input_text = gr.TextArea(lines=10, label="Input (multi-line)")
     with gr.Row():
         temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
         top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
-        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
     output_audio = gr.Audio(label="Generated Audio", type="numpy")
     run_button = gr.Button("🎤 Synthesize")

 FADE_MS = 200
 PAUSE_MS = 300
+generator = load_csm_1b(device="cuda")
 def make_silence(duration_ms=300):
     num_samples = int((SAMPLE_RATE * duration_ms) / 1000)
     audio_np[-fade_len:] *= fade_out
     return audio_np
+@spaces.GPU(duration=180)
 def infer(input_text, temp, top_k, top_p):
     lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
     all_audio = []
             max_audio_length_ms=8000,
             temperature=temp,
             topk=top_k,
+            # top_p is not currently used
         )
         audio_np = audio.cpu().numpy()
     audio_int16 = (full_audio * 32768).astype(np.int16)
     return SAMPLE_RATE, audio_int16
+# Default script with dramatic flair
+DEFAULT_TEXT = """I hunger...
+__PAUSE__
+For love!"""
 with gr.Blocks() as app:
     gr.Markdown("""
 # 🐸 Tahm Kench Voice Synth
 Enter lines of dialogue for Tahm Kench.
+- ⚠️ **Note:** This model may become confused by apostrophes (`'`) — avoid them if possible.
+- Use `__PAUSE__` on a line to insert a 300ms silent break.
 - Use shorter sentences and conservative sampling parameters to avoid hallucinations and degenerate output.
 """)
     with gr.Row():
+        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
     with gr.Row():
         temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
         top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
+        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P (not used)")
     output_audio = gr.Audio(label="Generated Audio", type="numpy")
     run_button = gr.Button("🎤 Synthesize")