ngxson HF Staff committed
Commit 34d0553 · Parent: 9ad1182

allow disable intro music, refine prompt
front/src/components/PodcastGenerator.tsx CHANGED

@@ -4,6 +4,7 @@ import { Podcast, PodcastTurn } from '../utils/types';
 import { parse } from 'yaml';
 import {
   addNoise,
+  addSilence,
   generateAudio,
   joinAudio,
   loadWavAndDecode,
@@ -89,6 +90,7 @@ export const PodcastGenerator = ({
   const [speaker1, setSpeaker1] = useState<string>('');
   const [speaker2, setSpeaker2] = useState<string>('');
   const [speed, setSpeed] = useState<string>('1.2');
+  const [addIntroMusic, setAddIntroMusic] = useState<boolean>(false);
 
   const setRandSpeaker = () => {
     const { s1, s2 } = getRandomSpeakerPair();
@@ -135,14 +137,18 @@ export const PodcastGenerator = ({
       step.audioBuffer = await loadWavAndDecode(url);
       if (i === 0) {
         outputWav = step.audioBuffer;
-        const openingSound = await loadWavAndDecode(openingSoundSrc);
-        outputWav = joinAudio(openingSound, outputWav!, -2);
+        if (addIntroMusic) {
+          const openingSound = await loadWavAndDecode(openingSoundSrc);
+          outputWav = joinAudio(openingSound, outputWav!, -2000);
+        } else {
+          outputWav = addSilence(outputWav!, true, 200);
+        }
       } else {
         const lastStep = steps[i - 1];
         outputWav = joinAudio(
           outputWav!,
           step.audioBuffer,
-          lastStep.turn.nextGapMilisecs / 1000
+          lastStep.turn.nextGapMilisecs
         );
       }
       setNumStepsDone(i + 1);
@@ -190,6 +196,7 @@ export const PodcastGenerator = ({
           ))}
         </select>
       </label>
+
       <label className="form-control w-full">
         <div className="label">
           <span className="label-text">Speaker 2</span>
@@ -206,12 +213,11 @@ export const PodcastGenerator = ({
           ))}
         </select>
       </label>
-      </div>
 
-      <div className="grid grid-cols-2 gap-4">
       <button className="btn" onClick={setRandSpeaker}>
         Randomize speakers
       </button>
+
       <label className="form-control w-full">
         <select
           className="select select-bordered"
@@ -225,6 +231,17 @@ export const PodcastGenerator = ({
           ))}
         </select>
       </label>
+
+      <div className="flex items-center gap-2">
+        <input
+          type="checkbox"
+          className="checkbox"
+          checked={addIntroMusic}
+          onChange={(e) => setAddIntroMusic(e.target.checked)}
+          disabled={isGenerating || busy}
+        />
+        Add intro music
+      </div>
     </div>
 
     <button
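A note on the generation-loop hunk: with intro music enabled, the opening sound and the first spoken turn are joined with a gap of -2000 ms, i.e. they overlap by two seconds; with it disabled, 200 ms of silence is prepended instead. Note also that nextGapMilisecs is now passed to joinAudio unscaled, matching the seconds-to-milliseconds signature change in front/src/utils/utils.ts below. A minimal standalone sketch of the new branch; the prepareFirstTurn wrapper and the '../utils/utils' import path are assumptions for illustration, not part of the commit:

import { addSilence, joinAudio, loadWavAndDecode } from '../utils/utils';

// Hypothetical helper mirroring the new `i === 0` branch above.
const prepareFirstTurn = async (
  firstTurn: AudioBuffer,
  addIntroMusic: boolean,
  openingSoundSrc: string
): Promise<AudioBuffer> => {
  if (addIntroMusic) {
    const openingSound = await loadWavAndDecode(openingSoundSrc);
    // Negative gap: overlap the clips, so speech starts 2000 ms
    // before the opening music ends rather than after a pause.
    return joinAudio(openingSound, firstTurn, -2000);
  }
  // No music: prepend 200 ms of silence (toBeginning = true),
  // presumably so the first syllable is not clipped on playback.
  return addSilence(firstTurn, true, 200);
};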
front/src/utils/prompts.ts CHANGED

@@ -15,35 +15,85 @@ Some rules:
 - There can be from 20 to 30 turns in total.
 - First turns should be the introduction for the theme and speakers.
 - The script will be passed to TTS engine, make sure to write plain pronunciation, for example the www. must pronounced like "www dot". Do NOT add anything strange, do NOT add facial expression in the text.
-- Only use base ASCII, do NOT use ALL CAPS, strings are wrapped inside "..."
 - In the first turn, you must introduce the subject and speakers. Make up a story about the speakers, how they know each other, and why they are talking about the subject.
 
 There is an example (it is truncated):
 
 [START OF EXAMPLE]
 \`\`\`yaml
-title: "Podcast about the history of the Internet"
+title: "Emerging AI Patterns: A Discussion"
 speakerNames:
-  - "Alice"
-  - "Davy Peel"
+  - "Alex"
+  - "Jordan"
 turns:
-  - index: 0
-    speakerName: "Alice"
-    text: "It wouldn't be better to follow on technology with Magnus Nystedt than to actually cross over using technology to [Wellington](+1) International School and [Whiz Radio](+1). And who better to kick off the Whiz Radio segment than the person who motivates the students over at Whiz, Wellington International School, than Miss Davy Peel. [Welcome](+2) ... Tell me, give me, give me the scoop here, because I, I came and talked to you a little while ago and said, what do you guys think about doing some kind of a [TEDx](+1) thing? And then I just sort of backed off."
-    nextGapMilisecs: 100
   - index: 1
-    speakerName: "Davy Peel"
-    text: "[Absolutely](+1)... An amazing opportunity to get the kids a bit of kind of creative freedom and, and a forum."
-    nextGapMilisecs: 1000
+    speakerName: "Alex"
+    text: "Welcome, [everyone](+2)! I'm Alex, and today, we're diving into emerging AI patterns. I'm joined by Jordan, a researcher and technologist. Jordan and I first met at a tech conference, where we bonded over our shared curiosity about AI trends. We've both followed the Thoughtworks Technology Radar for years, so we're excited to break down the latest themes from Volume [thirty](+1)!"
+    nextGapMilisecs: 50
+
+  - index: 2
+    speakerName: "Jordan"
+    text: "That's right! These Radar themes give insight into where tech is headed. One major focus this time? Large [language](+1) models, or L L M. No surprise ... AI is [everywhere](+2). About [thirty-two](+1) percent of the Radar's blips are tied to generative AI. One key theme: emerging architecture patterns for L L M."
+    nextGapMilisecs: 100
+
+  - index: 3
+    speakerName: "Alex"
+    text: "Let's start with an example ... [Chatbots](+1). Businesses are using L L M for customer service, e-Commerce, even legal and medical support. One standout pattern, is retrieval-augmented generation, or Rag. Jordan, can you break that down?"
+    nextGapMilisecs: 100
+
+  - index: 4
+    speakerName: "Jordan"
+    text: "Absolutely! ... Retrieval-augmented generation, or Rag, is about [dynamically](+1) injecting relevant data into prompts, instead of fine-tuning the model itself. Fine-tuning can be [expensive](+1), and often unnecessary. Rag pulls in fresh, specific information, making responses more relevant. If developers focus on just one AI technique, it should probably be [this](+1)."
+    nextGapMilisecs: -100
+
+  - index: 5
+    speakerName: "Alex"
+    text: "That's a perfect segue to another blip: 'Rush to fine-tuning' ... which is on hold. Many assume fine-tuning is always the answer, but is it?"
+    nextGapMilisecs: 50
+
+  - index: 6
+    speakerName: "Jordan"
+    text: "Not necessarily. Fine-tuning works best for adapting writing styles or learning new patterns, but [not](+2) for adding [facts](+1). If you want an L L M to understand company-specific knowledge, fine-tuning isn't always ideal. Rag is a better, more cost-effective choice."
+    nextGapMilisecs: 100
+
+  - index: 7
+    speakerName: "Alex"
+    text: "Now, getting L L M into production isn't just about retrieval. Monitoring and testing are crucial. I remember you mentioning Lang Fuse ... what does it do?"
+    nextGapMilisecs: 100
+
+  - index: 8
+    speakerName: "Jordan"
+    text: "Lang Fuse helps monitor performance, cost, and response quality. AI outputs aren't always predictable, so we need observability tools. Testing AI is tricky ... it's [non](+1) deterministic. That's where [guardrails](+1) come in. For example, Nemo Guardrails by Nvidia helps filter user inputs and model responses for security and ethical concerns."
+    nextGapMilisecs: 100
+
+  - index: 9
+    speakerName: "Alex"
+    text: "Speaking of optimization, another emerging pattern is combining models. Using a lightweight L L M for most tasks, then a more [powerful](+1) one for validation. Smart cost control, right?"
+    nextGapMilisecs: -100
+
+  - index: 10
+    speakerName: "Jordan"
+    text: "Exactly! This idea came up in discussions in South America. Instead of relying on a [single](+1) high-end model, companies are selectively verifying outputs. Balancing cost and performance is crucial when scaling A I applications."
+    nextGapMilisecs: 100
 \`\`\`
 [END OF EXAMPLE]
 
-The example above is truncated at index 1, REMEMBER TO CREATE AT LEAST 20 TURNS.
+The example above is truncated at index 10, REMEMBER TO CREATE AT LEAST 25 TURNS.
 The output text will be passed to TTS engine, make sure to be clean and natural:
 - Write NUMBER and abbreviations as WORDS, as they are pronounced
 - For some less-common abbreviations, write the full words
-- Use ... for pauses (IMPORTANT to add pauses), " and ' and ! and ? for intonation
-- IMPORTANT!! Write nicknames and names as they are pronounced. For example, "lora_rank=2" becomes "lora rank equals two", or "LoRA" becomes "Lo Ra", or "CrossEntropyLoss" becomes "Cross Entropy Loss", or "6GB" becomes "six gigabytes", "A6000" becomes "A six thousands"
+- Use ... for pauses (IMPORTANT to add pauses), " and ' and ! and ? for intonation. Do NOT use dash, use ... instead.
+- IMPORTANT!! Write nicknames, names, numbers as they are pronounced. For example:
+  - "lora_rank=2" becomes "lora rank equals two"
+  - "LoRA" becomes "Lo Ra"
+  - "CrossEntropyLoss" becomes "Cross Entropy Loss"
+  - "6GB" becomes "six gigabytes"
+  - "A6000" becomes "A six thousands"
+  - "CUDA" becomes "Cu-Da"
+  - (and so on)
+
+Example of a input text: "Great advice! Thanks, Jordan - That wraps up our discussion. Stay tuned for more deep dives into emerging tech!"
+Example of output: "Great advice! ... Thanks Jordan! ... That wraps up our discussion. Stay tuned for more deep dives into [emerging](+1) tech!"
 
 Make it engaging and have fun!
front/src/utils/utils.ts CHANGED

@@ -115,7 +115,7 @@ export const trimSilence = (audioBuffer: AudioBuffer): AudioBuffer => {
 export const joinAudio = (
   audio1: AudioBuffer,
   audio2: AudioBuffer,
-  gapSeconds: number,
+  gapMilisecs: number,
   overlap: 'none' | 'cross-fade' = 'none'
 ): AudioBuffer => {
   const sampleRate = audio1.sampleRate;
@@ -129,6 +129,7 @@ export const joinAudio = (
     throw new Error('Audio buffers must have the same number of channels');
   }
 
+  const gapSeconds = gapMilisecs / 1000;
   let newLength: number;
 
   if (gapSeconds > 0) {
@@ -242,6 +243,42 @@ export const addNoise = (
   return newBuffer;
 };
 
+export const addSilence = (
+  audioBuffer: AudioBuffer,
+  toBeginning: boolean,
+  durationMilisecs: number
+): AudioBuffer => {
+  // Convert duration from milliseconds to samples.
+  const sampleRate = audioBuffer.sampleRate;
+  const silenceSamples = Math.round((durationMilisecs / 1000) * sampleRate);
+  const numChannels = audioBuffer.numberOfChannels;
+  const originalLength = audioBuffer.length;
+  const newLength = originalLength + silenceSamples;
+
+  // Create a new AudioBuffer with extra space for the silence.
+  const newBuffer = new AudioBuffer({
+    length: newLength,
+    numberOfChannels: numChannels,
+    sampleRate: sampleRate,
+  });
+
+  // Process each channel: copy original audio into the correct position.
+  for (let channel = 0; channel < numChannels; channel++) {
+    const originalData = audioBuffer.getChannelData(channel);
+    const newData = newBuffer.getChannelData(channel);
+
+    if (toBeginning) {
+      // Leave the first `silenceSamples` as zeros, then copy the original data.
+      newData.set(originalData, silenceSamples);
+    } else {
+      // Copy the original data first; the remaining samples are already zeros.
+      newData.set(originalData, 0);
+    }
+  }
+
+  return newBuffer;
+};
+
 ////////////////////////////////////////
 // Audio formatting utils
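A quick usage sketch for the two utilities this file's hunks touch, assuming a browser context where AudioBuffer is available and two decoded clips with matching sample rate and channel count:

import { addSilence, joinAudio } from './utils';

// Assume two decoded speech clips (e.g. from loadWavAndDecode).
declare const turnA: AudioBuffer;
declare const turnB: AudioBuffer;

// Prepend 200 ms of silence to turnA (toBeginning = true)...
const padded = addSilence(turnA, true, 200);

// ...then append turnB after a 100 ms pause. Note the unit change in this
// commit: the third argument of joinAudio is now milliseconds, not seconds.
const joined = joinAudio(padded, turnB, 100);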
 
index.html CHANGED

@@ -14721,7 +14721,7 @@ const generateAudio = async (content, voice, speed = 1.1) => {
 const pickRand = (arr) => {
   return arr[Math.floor(Math.random() * arr.length)];
 };
-const joinAudio = (audio1, audio2, gapSeconds, overlap = "none") => {
+const joinAudio = (audio1, audio2, gapMilisecs, overlap = "none") => {
   const sampleRate = audio1.sampleRate;
   const numChannels = audio1.numberOfChannels;
   if (audio2.sampleRate !== sampleRate) {
@@ -14730,6 +14730,7 @@ const joinAudio = (audio1, audio2, gapSeconds, overlap = "none") => {
   if (audio2.numberOfChannels !== numChannels) {
     throw new Error("Audio buffers must have the same number of channels");
   }
+  const gapSeconds = gapMilisecs / 1e3;
   let newLength;
   if (gapSeconds > 0) {
     const gapSamples = Math.round(gapSeconds * sampleRate);
@@ -14809,6 +14810,26 @@ const addNoise = (audioBuffer, magnitude) => {
   }
   return newBuffer;
 };
+const addSilence = (audioBuffer, toBeginning, durationMilisecs) => {
+  const sampleRate = audioBuffer.sampleRate;
+  const silenceSamples = Math.round(durationMilisecs / 1e3 * sampleRate);
+  const numChannels = audioBuffer.numberOfChannels;
+  const originalLength = audioBuffer.length;
+  const newLength = originalLength + silenceSamples;
+  const newBuffer = new AudioBuffer({
+    length: newLength,
+    numberOfChannels: numChannels,
+    sampleRate
+  });
+  for (let channel = 0; channel < numChannels; channel++) {
+    const originalData = audioBuffer.getChannelData(channel);
+    const newData = newBuffer.getChannelData(channel);
+    {
+      newData.set(originalData, silenceSamples);
+    }
+  }
+  return newBuffer;
+};
 const loadWavAndDecode = async (url) => {
   const response = await fetch(url);
   const arrayBuffer = await response.arrayBuffer();
@@ -20988,6 +21009,7 @@ const PodcastGenerator = ({
   const [speaker1, setSpeaker1] = reactExports.useState("");
   const [speaker2, setSpeaker2] = reactExports.useState("");
   const [speed, setSpeed] = reactExports.useState("1.2");
+  const [addIntroMusic, setAddIntroMusic] = reactExports.useState(false);
   const setRandSpeaker = () => {
     const { s1, s2 } = getRandomSpeakerPair();
     setSpeaker1(s1);
@@ -21025,14 +21047,18 @@ const PodcastGenerator = ({
       step.audioBuffer = await loadWavAndDecode(url);
       if (i === 0) {
         outputWav = step.audioBuffer;
-        const openingSound = await loadWavAndDecode(openingSoundSrc);
-        outputWav = joinAudio(openingSound, outputWav, -2);
+        if (addIntroMusic) {
+          const openingSound = await loadWavAndDecode(openingSoundSrc);
+          outputWav = joinAudio(openingSound, outputWav, -2e3);
+        } else {
+          outputWav = addSilence(outputWav, true, 200);
+        }
       } else {
         const lastStep = steps[i - 1];
         outputWav = joinAudio(
           outputWav,
           step.audioBuffer,
-          lastStep.turn.nextGapMilisecs / 1e3
+          lastStep.turn.nextGapMilisecs
        );
      }
      setNumStepsDone(i + 1);
@@ -21085,9 +21111,7 @@ const PodcastGenerator = ({
       children: SPEAKERS.map((s) => /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: s.value, children: s.name }, s.value))
     }
   )
-  ] })
-  ] }),
-  /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: "grid grid-cols-2 gap-4", children: [
+  ] }),
   /* @__PURE__ */ jsxRuntimeExports.jsx("button", { className: "btn", onClick: setRandSpeaker, children: "Randomize speakers" }),
   /* @__PURE__ */ jsxRuntimeExports.jsx("label", { className: "form-control w-full", children: /* @__PURE__ */ jsxRuntimeExports.jsx(
     "select",
@@ -21103,7 +21127,20 @@ const PodcastGenerator = ({
       ")"
     ] }, s.value))
   }
-  ) })
+  ) }),
+  /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: "flex items-center gap-2", children: [
+    /* @__PURE__ */ jsxRuntimeExports.jsx(
+      "input",
+      {
+        type: "checkbox",
+        className: "checkbox",
+        checked: addIntroMusic,
+        onChange: (e) => setAddIntroMusic(e.target.checked),
+        disabled: isGenerating || busy
+      }
+    ),
+    "Add intro music"
+  ] })
   ] }),
   /* @__PURE__ */ jsxRuntimeExports.jsx(
     "button",
@@ -21152,35 +21189,85 @@ Some rules:
 [this hunk applies the same prompt changes to the bundled copy of the system prompt as shown above for front/src/utils/prompts.ts]
@@ -27640,6 +27727,21 @@ html {
   --tw-text-opacity: 1;
   color: var(--fallback-nc,oklch(var(--nc)/var(--tw-text-opacity)));
 }
+.checkbox {
+  flex-shrink: 0;
+  --chkbg: var(--fallback-bc,oklch(var(--bc)/1));
+  --chkfg: var(--fallback-b1,oklch(var(--b1)/1));
+  height: 1.5rem;
+  width: 1.5rem;
+  cursor: pointer;
+  -webkit-appearance: none;
+  -moz-appearance: none;
+  appearance: none;
+  border-radius: var(--rounded-btn, 0.5rem);
+  border-width: 1px;
+  border-color: var(--fallback-bc,oklch(var(--bc)/var(--tw-border-opacity)));
+  --tw-border-opacity: 0.2;
+}
 @media (hover: hover) {
 
   .btm-nav > *.disabled:hover,
@@ -28096,6 +28198,49 @@ html {
   overflow: hidden;
   border-radius: inherit;
 }
+.checkbox:focus {
+  box-shadow: none;
+}
+.checkbox:focus-visible {
+  outline-style: solid;
+  outline-width: 2px;
+  outline-offset: 2px;
+  outline-color: var(--fallback-bc,oklch(var(--bc)/1));
+}
+.checkbox:disabled {
+  border-width: 0px;
+  cursor: not-allowed;
+  border-color: transparent;
+  --tw-bg-opacity: 1;
+  background-color: var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));
+  opacity: 0.2;
+}
+.checkbox:checked,
+.checkbox[aria-checked="true"] {
+  background-repeat: no-repeat;
+  animation: checkmark var(--animation-input, 0.2s) ease-out;
+  background-color: var(--chkbg);
+  background-image: linear-gradient(-45deg, transparent 65%, var(--chkbg) 65.99%),
+    linear-gradient(45deg, transparent 75%, var(--chkbg) 75.99%),
+    linear-gradient(-45deg, var(--chkbg) 40%, transparent 40.99%),
+    linear-gradient(
+      45deg,
+      var(--chkbg) 30%,
+      var(--chkfg) 30.99%,
+      var(--chkfg) 40%,
+      transparent 40.99%
+    ),
+    linear-gradient(-45deg, var(--chkfg) 50%, var(--chkbg) 50.99%);
+}
+.checkbox:indeterminate {
+  --tw-bg-opacity: 1;
+  background-color: var(--fallback-bc,oklch(var(--bc)/var(--tw-bg-opacity)));
+  background-repeat: no-repeat;
+  animation: checkmark var(--animation-input, 0.2s) ease-out;
+  background-image: linear-gradient(90deg, transparent 80%, var(--chkbg) 80%),
+    linear-gradient(-90deg, transparent 80%, var(--chkbg) 80%),
+    linear-gradient(0deg, var(--chkbg) 43%, var(--chkfg) 43%, var(--chkfg) 57%, var(--chkbg) 57%);
+}
 @keyframes checkmark {
 
 0% {
@@ -28209,6 +28354,14 @@ html {
 .loading-lg {
   width: 2.5rem;
 }
+:where(.menu li:not(.menu-title, .disabled) > *:not(ul, details, .menu-title)):not(summary, .active, .btn).focus, :where(.menu li:not(.menu-title, .disabled) > *:not(ul, details, .menu-title)):not(summary, .active, .btn):focus, :where(.menu li:not(.menu-title, .disabled) > *:not(ul, details, .menu-title)):is(summary):not(.active, .btn):focus-visible, :where(.menu li:not(.menu-title, .disabled) > details > summary:not(.menu-title)):not(summary, .active, .btn).focus, :where(.menu li:not(.menu-title, .disabled) > details > summary:not(.menu-title)):not(summary, .active, .btn):focus, :where(.menu li:not(.menu-title, .disabled) > details > summary:not(.menu-title)):is(summary):not(.active, .btn):focus-visible {
+  cursor: pointer;
+  background-color: var(--fallback-bc,oklch(var(--bc)/0.1));
+  --tw-text-opacity: 1;
+  color: var(--fallback-bc,oklch(var(--bc)/var(--tw-text-opacity)));
+  outline: 2px solid transparent;
+  outline-offset: 2px;
+}
 .mockup-browser .mockup-browser-toolbar .input {
   position: relative;
   margin-left: auto;
@@ -28829,6 +28982,9 @@ html {
 .items-center {
   align-items: center;
 }
+.gap-2 {
+  gap: 0.5rem;
+}
 .gap-4 {
   gap: 1rem;
 }
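Two details worth noting in the bundled code. First, addSilence appears there with only the toBeginning === true path (a bare block around newData.set), which looks like minifier constant-folding of the single call site addSilence(outputWav, true, 200). Second, joinAudio now converts the gap from milliseconds to seconds to samples; a worked example of that arithmetic, with an illustrative 44100 Hz sample rate (the real rate comes from audio1.sampleRate):

const sampleRate = 44100;                                // illustrative; real value comes from audio1
const gapMilisecs = 100;                                 // nextGapMilisecs from the script
const gapSeconds = gapMilisecs / 1e3;                    // 0.1 s
const gapSamples = Math.round(gapSeconds * sampleRate);  // 4410 samples of inserted silence
console.log(gapSamples); // 4410
// A negative gap, like the -2e3 used for the intro music, would presumably
// overlap the two buffers instead: 2 s * 44100 Hz = 88200 samples.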