allow disable intro music, refine prompt

- front/src/components/PodcastGenerator.tsx +22 -5
- front/src/utils/prompts.ts +64 -14
- front/src/utils/utils.ts +38 -1
- index.html +178 -22
front/src/components/PodcastGenerator.tsx
CHANGED

```diff
@@ -4,6 +4,7 @@ import { Podcast, PodcastTurn } from '../utils/types';
 import { parse } from 'yaml';
 import {
   addNoise,
+  addSilence,
   generateAudio,
   joinAudio,
   loadWavAndDecode,
@@ -89,6 +90,7 @@ export const PodcastGenerator = ({
   const [speaker1, setSpeaker1] = useState<string>('');
   const [speaker2, setSpeaker2] = useState<string>('');
   const [speed, setSpeed] = useState<string>('1.2');
+  const [addIntroMusic, setAddIntroMusic] = useState<boolean>(false);

   const setRandSpeaker = () => {
     const { s1, s2 } = getRandomSpeakerPair();
@@ -135,14 +137,18 @@
         step.audioBuffer = await loadWavAndDecode(url);
         if (i === 0) {
           outputWav = step.audioBuffer;
-
-
+          if (addIntroMusic) {
+            const openingSound = await loadWavAndDecode(openingSoundSrc);
+            outputWav = joinAudio(openingSound, outputWav!, -2000);
+          } else {
+            outputWav = addSilence(outputWav!, true, 200);
+          }
         } else {
           const lastStep = steps[i - 1];
           outputWav = joinAudio(
             outputWav!,
             step.audioBuffer,
-            lastStep.turn.nextGapMilisecs
+            lastStep.turn.nextGapMilisecs
           );
         }
         setNumStepsDone(i + 1);
@@ -190,6 +196,7 @@
           ))}
         </select>
       </label>
+
       <label className="form-control w-full">
         <div className="label">
           <span className="label-text">Speaker 2</span>
@@ -206,12 +213,11 @@
           ))}
         </select>
       </label>
-      </div>

-      <div className="grid grid-cols-2 gap-4">
       <button className="btn" onClick={setRandSpeaker}>
         Randomize speakers
       </button>
+
       <label className="form-control w-full">
         <select
           className="select select-bordered"
@@ -225,6 +231,17 @@
           ))}
         </select>
       </label>
+
+      <div className="flex items-center gap-2">
+        <input
+          type="checkbox"
+          className="checkbox"
+          checked={addIntroMusic}
+          onChange={(e) => setAddIntroMusic(e.target.checked)}
+          disabled={isGenerating || busy}
+        />
+        Add intro music
+      </div>
     </div>

     <button
```
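Taken together, the generator now builds the start of the waveform like this: when "Add intro music" is checked, the opening sound is decoded and joined to the first turn with a gap of -2000 ms, i.e. a two-second overlap; otherwise the first turn simply gets 200 ms of leading silence. A minimal sketch of that data flow, assuming the helpers are exported from front/src/utils/utils.ts; the asset import for `openingSoundSrc` is hypothetical, since the diff uses the name but does not show where it is defined:

```ts
import { addSilence, joinAudio, loadWavAndDecode } from '../utils/utils';
// Hypothetical asset import; the diff references openingSoundSrc but does not
// show its origin.
import openingSoundSrc from '../assets/opening.wav';

// Prepare the first chunk of the podcast according to the checkbox state.
async function buildIntro(
  firstTurn: AudioBuffer,
  withMusic: boolean
): Promise<AudioBuffer> {
  if (withMusic) {
    const opening = await loadWavAndDecode(openingSoundSrc);
    // Negative gap: the last two seconds of the music play under the speech.
    return joinAudio(opening, firstTurn, -2000);
  }
  // No music: just pad 200 ms of silence before the first spoken turn.
  return addSilence(firstTurn, true, 200);
}
```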
front/src/utils/prompts.ts
CHANGED

```diff
@@ -15,35 +15,85 @@ Some rules:
 - There can be from 20 to 30 turns in total.
 - First turns should be the introduction for the theme and speakers.
 - The script will be passed to TTS engine, make sure to write plain pronunciation, for example the www. must pronounced like "www dot". Do NOT add anything strange, do NOT add facial expression in the text.
-- Only use base ASCII, do NOT use ALL CAPS, strings are wrapped inside "..."
 - In the first turn, you must introduce the subject and speakers. Make up a story about the speakers, how they know each other, and why they are talking about the subject.

 There is an example (it is truncated):

 [START OF EXAMPLE]
 \`\`\`yaml
-title: "
+title: "Emerging AI Patterns: A Discussion"
 speakerNames:
-  - "
-  - "
+  - "Alex"
+  - "Jordan"
 turns:
-  - index: 0
-    speakerName: "Alice"
-    text: "It wouldn't be better to follow on technology with Magnus Nystedt than to actually cross over using technology to [Wellington](+1) International School and [Whiz Radio](+1). And who better to kick off the Whiz Radio segment than the person who motivates the students over at Whiz, Wellington International School, than Miss Davy Peel. [Welcome](+2) ... Tell me, give me, give me the scoop here, because I, I came and talked to you a little while ago and said, what do you guys think about doing some kind of a [TEDx](+1) thing? And then I just sort of backed off."
-    nextGapMilisecs: 100
   - index: 1
-    speakerName: "
-    text: "[
-    nextGapMilisecs:
+    speakerName: "Alex"
+    text: "Welcome, [everyone](+2)! I'm Alex, and today, we're diving into emerging AI patterns. I'm joined by Jordan, a researcher and technologist. Jordan and I first met at a tech conference, where we bonded over our shared curiosity about AI trends. We've both followed the Thoughtworks Technology Radar for years, so we're excited to break down the latest themes from Volume [thirty](+1)!"
+    nextGapMilisecs: 50
+
+  - index: 2
+    speakerName: "Jordan"
+    text: "That's right! These Radar themes give insight into where tech is headed. One major focus this time? Large [language](+1) models, or L L M. No surprise ... AI is [everywhere](+2). About [thirty-two](+1) percent of the Radar's blips are tied to generative AI. One key theme: emerging architecture patterns for L L M."
+    nextGapMilisecs: 100
+
+  - index: 3
+    speakerName: "Alex"
+    text: "Let's start with an example ... [Chatbots](+1). Businesses are using L L M for customer service, e-Commerce, even legal and medical support. One standout pattern, is retrieval-augmented generation, or Rag. Jordan, can you break that down?"
+    nextGapMilisecs: 100
+
+  - index: 4
+    speakerName: "Jordan"
+    text: "Absolutely! ... Retrieval-augmented generation, or Rag, is about [dynamically](+1) injecting relevant data into prompts, instead of fine-tuning the model itself. Fine-tuning can be [expensive](+1), and often unnecessary. Rag pulls in fresh, specific information, making responses more relevant. If developers focus on just one AI technique, it should probably be [this](+1)."
+    nextGapMilisecs: -100
+
+  - index: 5
+    speakerName: "Alex"
+    text: "That's a perfect segue to another blip: 'Rush to fine-tuning' ... which is on hold. Many assume fine-tuning is always the answer, but is it?"
+    nextGapMilisecs: 50
+
+  - index: 6
+    speakerName: "Jordan"
+    text: "Not necessarily. Fine-tuning works best for adapting writing styles or learning new patterns, but [not](+2) for adding [facts](+1). If you want an L L M to understand company-specific knowledge, fine-tuning isn't always ideal. Rag is a better, more cost-effective choice."
+    nextGapMilisecs: 100
+
+  - index: 7
+    speakerName: "Alex"
+    text: "Now, getting L L M into production isn't just about retrieval. Monitoring and testing are crucial. I remember you mentioning Lang Fuse ... what does it do?"
+    nextGapMilisecs: 100
+
+  - index: 8
+    speakerName: "Jordan"
+    text: "Lang Fuse helps monitor performance, cost, and response quality. AI outputs aren't always predictable, so we need observability tools. Testing AI is tricky ... it's [non](+1) deterministic. That's where [guardrails](+1) come in. For example, Nemo Guardrails by Nvidia helps filter user inputs and model responses for security and ethical concerns."
+    nextGapMilisecs: 100
+
+  - index: 9
+    speakerName: "Alex"
+    text: "Speaking of optimization, another emerging pattern is combining models. Using a lightweight L L M for most tasks, then a more [powerful](+1) one for validation. Smart cost control, right?"
+    nextGapMilisecs: -100
+
+  - index: 10
+    speakerName: "Jordan"
+    text: "Exactly! This idea came up in discussions in South America. Instead of relying on a [single](+1) high-end model, companies are selectively verifying outputs. Balancing cost and performance is crucial when scaling A I applications."
+    nextGapMilisecs: 100
 \`\`\`
 [END OF EXAMPLE]

-The example above is truncated at index
+The example above is truncated at index 10, REMEMBER TO CREATE AT LEAST 25 TURNS.
 The output text will be passed to TTS engine, make sure to be clean and natural:
 - Write NUMBER and abbreviations as WORDS, as they are pronounced
 - For some less-common abbreviations, write the full words
-- Use ... for pauses (IMPORTANT to add pauses), " and ' and ! and ? for intonation
-- IMPORTANT!! Write nicknames
+- Use ... for pauses (IMPORTANT to add pauses), " and ' and ! and ? for intonation. Do NOT use dash, use ... instead.
+- IMPORTANT!! Write nicknames, names, numbers as they are pronounced. For example:
+  - "lora_rank=2" becomes "lora rank equals two"
+  - "LoRA" becomes "Lo Ra"
+  - "CrossEntropyLoss" becomes "Cross Entropy Loss"
+  - "6GB" becomes "six gigabytes"
+  - "A6000" becomes "A six thousands"
+  - "CUDA" becomes "Cu-Da"
+  - (and so on)
+
+Example of a input text: "Great advice! Thanks, Jordan - That wraps up our discussion. Stay tuned for more deep dives into emerging tech!"
+Example of output: "Great advice! ... Thanks Jordan! ... That wraps up our discussion. Stay tuned for more deep dives into [emerging](+1) tech!"

 Make it engaging and have fun!
```
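For context on what this prompt produces: the component parses the model's YAML with `parse` from 'yaml' into the `Podcast`/`PodcastTurn` types imported at the top of PodcastGenerator.tsx. A rough sketch of the expected shape, read off the example above; the authoritative definitions live in front/src/utils/types.ts, so the exact field list here is an assumption:

```ts
import { parse } from 'yaml';

// Assumed shape of the generated script; the real types are in
// front/src/utils/types.ts.
interface PodcastTurn {
  index: number;
  speakerName: string;
  text: string; // may carry [word](+N) emphasis markers for the TTS engine
  nextGapMilisecs: number; // pause before the next turn; negative = overlap
}

interface Podcast {
  title: string;
  speakerNames: string[];
  turns: PodcastTurn[];
}

// The model's YAML output parses directly into that shape.
export const parsePodcastScript = (yamlScript: string): Podcast =>
  parse(yamlScript) as Podcast;
```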
front/src/utils/utils.ts
CHANGED

```diff
@@ -115,7 +115,7 @@ export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
 export const joinAudio = (
   audio1: AudioBuffer,
   audio2: AudioBuffer,
-  gapSeconds: number,
+  gapMilisecs: number,
   overlap: 'none' | 'cross-fade' = 'none'
 ): AudioBuffer => {
   const sampleRate = audio1.sampleRate;
@@ -129,6 +129,7 @@ export const joinAudio = (
     throw new Error('Audio buffers must have the same number of channels');
   }

+  const gapSeconds = gapMilisecs / 1000;
   let newLength: number;

   if (gapSeconds > 0) {
@@ -242,6 +243,42 @@ export const addNoise = (
   return newBuffer;
 };

+export const addSilence = (
+  audioBuffer: AudioBuffer,
+  toBeginning: boolean,
+  durationMilisecs: number
+): AudioBuffer => {
+  // Convert duration from milliseconds to samples.
+  const sampleRate = audioBuffer.sampleRate;
+  const silenceSamples = Math.round((durationMilisecs / 1000) * sampleRate);
+  const numChannels = audioBuffer.numberOfChannels;
+  const originalLength = audioBuffer.length;
+  const newLength = originalLength + silenceSamples;
+
+  // Create a new AudioBuffer with extra space for the silence.
+  const newBuffer = new AudioBuffer({
+    length: newLength,
+    numberOfChannels: numChannels,
+    sampleRate: sampleRate,
+  });
+
+  // Process each channel: copy original audio into the correct position.
+  for (let channel = 0; channel < numChannels; channel++) {
+    const originalData = audioBuffer.getChannelData(channel);
+    const newData = newBuffer.getChannelData(channel);
+
+    if (toBeginning) {
+      // Leave the first `silenceSamples` as zeros, then copy the original data.
+      newData.set(originalData, silenceSamples);
+    } else {
+      // Copy the original data first; the remaining samples are already zeros.
+      newData.set(originalData, 0);
+    }
+  }
+
+  return newBuffer;
+};
+
 ////////////////////////////////////////
 // Audio formatting utils
```
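The diff shows the millisecond-to-second conversion but not the branch that handles negative gaps; conceptually, a positive gap inserts silence between the two buffers, while a negative gap overlaps them, which is what the -2000 intro-music join relies on. A standalone sketch of that idea on raw mono sample arrays, under the assumption that overlapping regions are mixed by summation (the project's actual implementation works on AudioBuffer channels and also supports a 'cross-fade' mode):

```ts
// Join two mono signals with a gap in milliseconds: a positive gap inserts
// that much silence, a negative gap overlaps the tail of `a` with the head
// of `b` and sums the overlapping samples.
function joinSamples(
  a: Float32Array,
  b: Float32Array,
  gapMilisecs: number,
  sampleRate: number
): Float32Array {
  const gap = Math.round((gapMilisecs / 1000) * sampleRate);
  if (gap >= 0) {
    const out = new Float32Array(a.length + gap + b.length);
    out.set(a, 0);
    out.set(b, a.length + gap); // the gap region stays zero, i.e. silence
    return out;
  }
  const overlap = Math.min(-gap, a.length, b.length);
  const out = new Float32Array(a.length + b.length - overlap);
  out.set(a, 0);
  const start = a.length - overlap;
  for (let i = 0; i < b.length; i++) {
    out[start + i] += b[i]; // mix where the two signals overlap
  }
  return out;
}
```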
index.html
CHANGED

index.html is the prebuilt single-file bundle, so its diff mirrors the source changes above: the same joinAudio/addSilence edits and intro-music logic in the bundled JavaScript, the same prompt text, plus the daisyUI checkbox styles used by the new "Add intro music" control.

```diff
@@ -14721,7 +14721,7 @@ const generateAudio = async (content, voice, speed = 1.1) => {
 const pickRand = (arr) => {
   return arr[Math.floor(Math.random() * arr.length)];
 };
-const joinAudio = (audio1, audio2, gapSeconds, overlap = "none") => {
+const joinAudio = (audio1, audio2, gapMilisecs, overlap = "none") => {
   const sampleRate = audio1.sampleRate;
   const numChannels = audio1.numberOfChannels;
   if (audio2.sampleRate !== sampleRate) {
@@ -14730,6 +14730,7 @@ const joinAudio = (audio1, audio2, gapSeconds, overlap = "none") => {
   if (audio2.numberOfChannels !== numChannels) {
     throw new Error("Audio buffers must have the same number of channels");
   }
+  const gapSeconds = gapMilisecs / 1e3;
   let newLength;
   if (gapSeconds > 0) {
     const gapSamples = Math.round(gapSeconds * sampleRate);
@@ -14809,6 +14810,26 @@ const addNoise = (audioBuffer, magnitude) => {
   }
   return newBuffer;
 };
+const addSilence = (audioBuffer, toBeginning, durationMilisecs) => {
+  const sampleRate = audioBuffer.sampleRate;
+  const silenceSamples = Math.round(durationMilisecs / 1e3 * sampleRate);
+  const numChannels = audioBuffer.numberOfChannels;
+  const originalLength = audioBuffer.length;
+  const newLength = originalLength + silenceSamples;
+  const newBuffer = new AudioBuffer({
+    length: newLength,
+    numberOfChannels: numChannels,
+    sampleRate
+  });
+  for (let channel = 0; channel < numChannels; channel++) {
+    const originalData = audioBuffer.getChannelData(channel);
+    const newData = newBuffer.getChannelData(channel);
+    {
+      newData.set(originalData, silenceSamples);
+    }
+  }
+  return newBuffer;
+};
 const loadWavAndDecode = async (url) => {
   const response = await fetch(url);
   const arrayBuffer = await response.arrayBuffer();
@@ -20988,6 +21009,7 @@ const PodcastGenerator = ({
 const [speaker1, setSpeaker1] = reactExports.useState("");
 const [speaker2, setSpeaker2] = reactExports.useState("");
 const [speed, setSpeed] = reactExports.useState("1.2");
+const [addIntroMusic, setAddIntroMusic] = reactExports.useState(false);
 const setRandSpeaker = () => {
   const { s1, s2 } = getRandomSpeakerPair();
   setSpeaker1(s1);
@@ -21025,14 +21047,18 @@
 step.audioBuffer = await loadWavAndDecode(url);
 if (i === 0) {
   outputWav = step.audioBuffer;
-
-
+  if (addIntroMusic) {
+    const openingSound = await loadWavAndDecode(openingSoundSrc);
+    outputWav = joinAudio(openingSound, outputWav, -2e3);
+  } else {
+    outputWav = addSilence(outputWav, true, 200);
+  }
 } else {
   const lastStep = steps[i - 1];
   outputWav = joinAudio(
     outputWav,
     step.audioBuffer,
-    lastStep.turn.nextGapMilisecs
+    lastStep.turn.nextGapMilisecs
   );
 }
 setNumStepsDone(i + 1);
@@ -21085,9 +21111,7 @@
 children: SPEAKERS.map((s) => /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: s.value, children: s.name }, s.value))
 }
 )
-] })
-] }),
-/* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: "grid grid-cols-2 gap-4", children: [
+] }),
 /* @__PURE__ */ jsxRuntimeExports.jsx("button", { className: "btn", onClick: setRandSpeaker, children: "Randomize speakers" }),
 /* @__PURE__ */ jsxRuntimeExports.jsx("label", { className: "form-control w-full", children: /* @__PURE__ */ jsxRuntimeExports.jsx(
 "select",
@@ -21103,7 +21127,20 @@
 ")"
 ] }, s.value))
 }
-) })
+) }),
+/* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: "flex items-center gap-2", children: [
+  /* @__PURE__ */ jsxRuntimeExports.jsx(
+    "input",
+    {
+      type: "checkbox",
+      className: "checkbox",
+      checked: addIntroMusic,
+      onChange: (e) => setAddIntroMusic(e.target.checked),
+      disabled: isGenerating || busy
+    }
+  ),
+  "Add intro music"
+] })
 ] }),
 /* @__PURE__ */ jsxRuntimeExports.jsx(
 "button",
```

The hunk at `@@ -21152,35 +21189,85 @@ Some rules:` is the bundled copy of the system prompt; it carries exactly the prompt rewrite shown above for front/src/utils/prompts.ts. The remaining hunks add the checkbox styles:

```diff
@@ -27640,6 +27727,21 @@ html {
   --tw-text-opacity: 1;
   color: var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));
 }
+.checkbox {
+  flex-shrink: 0;
+  --chkbg: var(--fallback-bc,oklch(var(--bc)/1));
+  --chkfg: var(--fallback-b1,oklch(var(--b1)/1));
+  height: 1.5rem;
+  width: 1.5rem;
+  cursor: pointer;
+  -webkit-appearance: none;
+  -moz-appearance: none;
+  appearance: none;
+  border-radius: var(--rounded-btn, 0.5rem);
+  border-width: 1px;
+  border-color: var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));
+  --tw-border-opacity: 0.2;
+}
 @media (hover: hover) {

 .btm-nav > *.disabled:hover,
@@ -28096,6 +28198,49 @@ html {
   overflow: hidden;
   border-radius: inherit;
 }
+.checkbox:focus {
+  box-shadow: none;
+}
+.checkbox:focus-visible {
+  outline-style: solid;
+  outline-width: 2px;
+  outline-offset: 2px;
+  outline-color: var(--fallback-bc,oklch(var(--bc)/1));
+}
+.checkbox:disabled {
+  border-width: 0px;
+  cursor: not-allowed;
+  border-color: transparent;
+  --tw-bg-opacity: 1;
+  background-color: var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));
+  opacity: 0.2;
+}
+.checkbox:checked,
+.checkbox[aria-checked="true"] {
+  background-repeat: no-repeat;
+  animation: checkmark var(--animation-input, 0.2s) ease-out;
+  background-color: var(--chkbg);
+  background-image: linear-gradient(-45deg, transparent 65%, var(--chkbg) 65.99%),
+    linear-gradient(45deg, transparent 75%, var(--chkbg) 75.99%),
+    linear-gradient(-45deg, var(--chkbg) 40%, transparent 40.99%),
+    linear-gradient(
+      45deg,
+      var(--chkbg) 30%,
+      var(--chkfg) 30.99%,
+      var(--chkfg) 40%,
+      transparent 40.99%
+    ),
+    linear-gradient(-45deg, var(--chkfg) 50%, var(--chkbg) 50.99%);
+}
+.checkbox:indeterminate {
+  --tw-bg-opacity: 1;
+  background-color: var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));
+  background-repeat: no-repeat;
+  animation: checkmark var(--animation-input, 0.2s) ease-out;
+  background-image: linear-gradient(90deg, transparent 80%, var(--chkbg) 80%),
+    linear-gradient(-90deg, transparent 80%, var(--chkbg) 80%),
+    linear-gradient(0deg, var(--chkbg) 43%, var(--chkfg) 43%, var(--chkfg) 57%, var(--chkbg) 57%);
+}
 @keyframes checkmark {

 0% {
@@ -28209,6 +28354,14 @@
 .loading-lg {
   width: 2.5rem;
 }
+:where(.menu li:not(.menu-title, .disabled) > *:not(ul, details, .menu-title)):not(summary, .active, .btn).focus, :where(.menu li:not(.menu-title, .disabled) > *:not(ul, details, .menu-title)):not(summary, .active, .btn):focus, :where(.menu li:not(.menu-title, .disabled) > *:not(ul, details, .menu-title)):is(summary):not(.active, .btn):focus-visible, :where(.menu li:not(.menu-title, .disabled) > details > summary:not(.menu-title)):not(summary, .active, .btn).focus, :where(.menu li:not(.menu-title, .disabled) > details > summary:not(.menu-title)):not(summary, .active, .btn):focus, :where(.menu li:not(.menu-title, .disabled) > details > summary:not(.menu-title)):is(summary):not(.active, .btn):focus-visible {
+  cursor: pointer;
+  background-color: var(--fallback-bc,oklch(var(--bc)/0.1));
+  --tw-text-opacity: 1;
+  color: var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));
+  outline: 2px solid transparent;
+  outline-offset: 2px;
+}
 .mockup-browser .mockup-browser-toolbar .input {
   position: relative;
   margin-left: auto;
@@ -28829,6 +28982,9 @@ html {
 .items-center {
   align-items: center;
 }
+.gap-2 {
+  gap: 0.5rem;
+}
 .gap-4 {
   gap: 1rem;
 }
```