Spaces: Running on Zero
Commit: This works well

app.py CHANGED
@@ -25,8 +25,26 @@ client = InferenceClient(
     token=os.getenv("HF_TOKEN"),
 )

+
+def generate_podcast_text(subject: str) -> str:
+    """Ask the LLM for a script of a podcast given by two hosts."""
+    prompt = f"""Generate the script of "Open Paper review", a podcast told by 2 hosts about {subject}.
+The podcast should be an insightful discussion, with some amount of playful banter.
+Separate dialog as follows, using [S1] for the male host and [S2] for the female host, for instance:
+[S1] Hello, how are you?
+[S2] I'm good, thank you. How are you?
+[S1] I'm good, thank you.
+[S2] Great.
+Now go on, make 5 minutes of podcast.
+"""
+    response = client.chat_completion(
+        [{"role": "user", "content": prompt}],
+        max_tokens=8156,
+    )
+    return response.choices[0].message.content
+
 # -----------------------------------------------------------------------------
-# Kokoro TTS
+# Kokoro TTS
 # -----------------------------------------------------------------------------
 CUDA_AVAILABLE = torch.cuda.is_available()

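The prompt above fixes a tag convention ([S1] for the male host, [S2] for the female host) that the TTS stage later relies on when picking a voice per line. A minimal sketch of splitting such a script into (voice, text) pairs; the helper name split_script and its signature are illustrative and not part of app.py:

import re

def split_script(script: str, male_voice: str, female_voice: str) -> list[tuple[str, str]]:
    """Turn an [S1]/[S2]-tagged script into (voice, line) pairs."""
    pairs = []
    for raw in script.splitlines():
        m = re.match(r"\[(S[12])\]\s*(.+)", raw.strip())
        if not m:
            continue  # skip blank lines and untagged text
        tag, text = m.groups()
        pairs.append((male_voice if tag == "S1" else female_voice, text))
    return pairs

# e.g. split_script(generate_podcast_text("sparse attention"), MALE_VOICE, FEMALE_VOICE)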
@@ -40,29 +58,13 @@ FEMALE_VOICE = "af_heart" # [S2]
 for v in (MALE_VOICE, FEMALE_VOICE):
     kpipeline.load_voice(v)

+# -----------------------------------------------------------------------------
+# Audio generation system with queue
+# -----------------------------------------------------------------------------

 audio_queue: queue.Queue[tuple[int, np.ndarray] | None] = queue.Queue()
 stop_signal = threading.Event()

-
-
-def generate_podcast_text(subject: str) -> str:
-    """Ask the LLM for a ~5-minute two-host script."""
-    prompt = f"""Generate a podcast told by 2 hosts about {subject}.
-The podcast should be an insightful discussion, with some amount of playful banter.
-Separate dialog as follows using [S1] for the male host and [S2] for the female host, for instance:
-[S1] Hello, how are you?
-[S2] I'm good, thank you. How are you?
-[S1] I'm good, thank you. (laughs)
-[S2] Great.
-Now go on, make 5 minutes of podcast.
-"""
-    response = client.chat_completion(
-        [{"role": "user", "content": prompt}],
-        max_tokens=1000,
-    )
-    return response.choices[0].message.content
-
 @spaces.GPU
 def process_audio_chunks(podcast_text: str, speed: float = 1.0) -> None:
     """Read each line, pick voice via tag, send chunks to the queue."""
@@ -96,7 +98,6 @@ def process_audio_chunks(podcast_text: str, speed: float = 1.0) -> None:
         audio = kmodel(ps, ref_s, speed)
         audio_queue.put((24000, audio.numpy()))
         audio_numpy = audio.numpy()
-        print("GENERATED AUDIO", audio_numpy[-100:], audio_numpy.max())
         if first:
             first = False
             audio_queue.put((24000, torch.zeros(1).numpy()))
@@ -111,14 +112,12 @@ def stream_audio_generator(podcast_text: str):
         chunk = audio_queue.get()
         if chunk is None:
             break
-        print("CHUNK", chunk, type(chunk))
         sr, data = chunk

         buf = io.BytesIO()
         sf.write(buf, data, sr, format="wav")
         buf.seek(0)
-        yield buf.getvalue()
-
+        yield buf.getvalue(), "Generating podcast..."


 def stop_generation():
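stream_audio_generator wraps each raw chunk in a WAV container before yielding it; with the change above it also yields a status string for the UI. A small stand-alone illustration of the soundfile + io.BytesIO encoding step (the silent chunk is only a stand-in for real TTS output):

import io

import numpy as np
import soundfile as sf

sr = 24000
data = np.zeros(2400, dtype=np.float32)  # 0.1 s of silence as a stand-in chunk

buf = io.BytesIO()
sf.write(buf, data, sr, format="wav")  # encode the chunk as a complete WAV file
buf.seek(0)
wav_bytes = buf.getvalue()
print(len(wav_bytes))  # WAV header plus PCM payload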
@@ -155,7 +154,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:

     generate_btn.click(fn=generate_podcast, outputs=podcast_output)

-    start_audio_btn.click(fn=stream_audio_generator, inputs=podcast_output, outputs=audio_output)
+    start_audio_btn.click(fn=stream_audio_generator, inputs=podcast_output, outputs=[audio_output, status_text])
     stop_btn.click(fn=stop_generation, outputs=status_text)

 if __name__ == "__main__":
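Because the generator now yields (wav_bytes, status) pairs, the click event must list two output components, hence outputs=[audio_output, status_text]. A minimal Gradio sketch of that generator-to-multiple-outputs pattern, reduced to two textboxes so it runs without any audio backend (all names here are illustrative, not the app's real components):

import time

import gradio as gr

def stream_parts(subject: str):
    # Each yield updates both outputs: (running text, status message).
    text = ""
    for i in range(1, 4):
        time.sleep(0.5)
        text += f"[S1] Part {i} about {subject}.\n"
        yield text, f"Generating part {i}/3..."
    yield text, "Done."

with gr.Blocks() as demo:
    topic = gr.Textbox(label="Subject", value="open paper review")
    script_box = gr.Textbox(label="Script", lines=6)
    status_box = gr.Textbox(label="Status")
    go = gr.Button("Generate")
    go.click(fn=stream_parts, inputs=topic, outputs=[script_box, status_box])

if __name__ == "__main__":
    demo.launch()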