import { ClapProject, ClapSegment, getClapAssetSourceType, filterSegments, ClapSegmentFilteringMode, ClapSegmentCategory } from "@aitube/clap"
import { ClapCompletionMode } from "@aitube/client"
import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"

import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
import { getMediaInfo } from "@/app/api/utils/getMediaInfo"

/**
 * Generates the dialogue audio for a single shot, if it is missing.
 *
 * The segments related to `shotSegment` are looked up in `existingClap`, and
 * the first one in the DIALOGUE category is considered. When that segment has
 * no `assetUrl` yet, speech is generated from its prompt and the segment is
 * updated in place (`assetUrl`, `assetSourceType` and, when the audio is long
 * enough, `assetDurationInMs`).
 *
 * @param shotSegment - the shot whose dialogue should be voiced
 * @param existingClap - project the shot belongs to; its segments are mutated in place
 * @param newerClap - project receiving the updated dialogue segment when `mode` is not FULL
 * @param mode - completion mode; anything other than FULL appends the segment to `newerClap`
 * @param turbo - currently unused by this function
 * @throws rethrows any error raised while generating or probing the audio
 */
export async function processShot({
  shotSegment,
  existingClap,
  newerClap,
  mode,
  turbo,
}: {
  shotSegment: ClapSegment
  existingClap: ClapProject
  newerClap: ClapProject
  mode: ClapCompletionMode
  turbo: boolean
}): Promise<void> {
  const shotSegments: ClapSegment[] = filterSegments(
    ClapSegmentFilteringMode.BOTH,
    shotSegment,
    existingClap.segments
  )

  const shotDialogueSegments: ClapSegment[] = shotSegments.filter(s =>
    s.category === ClapSegmentCategory.DIALOGUE
  )

  // only the first dialogue of the shot is processed
  const shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)

  console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)

  if (!shotDialogueSegment) {
    // previously this case was reported as "there is already a dialogue audio: undefined..."
    console.log(`[api/edit/dialogues] processShot: shot has no dialogue segment, nothing to generate`)
    return
  }

  if (shotDialogueSegment.assetUrl) {
    console.log(`[api/edit/dialogues] processShot: there is already a dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)
    return
  }

  // console.log(`[api/edit/dialogues] generating audio..`)

  try {
    // this generates a mp3
    shotDialogueSegment.assetUrl = await generateSpeechWithParlerTTS({
      text: shotDialogueSegment.prompt,
      audioId: getSpeechBackgroundAudioPrompt(
        shotSegments,
        existingClap.entityIndex,
        // TODO: use the entity description if it exists
        ["high quality", "crisp", "detailed"]
      ),
      debug: true,
    })
    shotDialogueSegment.assetSourceType = getClapAssetSourceType(shotDialogueSegment.assetUrl)

    const { durationInMs, hasAudio } = await getMediaInfo(shotDialogueSegment.assetUrl)

    // durations are only propagated when the file has audio lasting more than one second
    if (hasAudio && durationInMs > 1000) {
      shotDialogueSegment.assetDurationInMs = durationInMs
      shotSegment.assetDurationInMs = durationInMs

      // we update the duration of all the segments for this shot
      // (it is possible that this makes the two previous lines redundant)
      // NOTE: this used to iterate over existingClap.segments, which overwrote
      // the duration of every segment in the project, not just this shot's.
      // NOTE(review): assumes filterSegments returns references to the
      // project's own segment objects — confirm.
      for (const segment of shotSegments) {
        segment.assetDurationInMs = durationInMs
      }
    }
  } catch (err) {
    console.log(`[api/edit/dialogues] processShot: failed to generate audio: ${err}`)
    throw err
  }

  console.log(`[api/edit/dialogues] processShot: generated dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)

  // if it's partial, we need to manually add it
  if (mode !== ClapCompletionMode.FULL) {
    newerClap.segments.push(shotDialogueSegment)
  }
}