File size: 2,952 Bytes
8e3d721
8919651
 
 
 
 
58b1ffb
 
8919651
 
8e3d721
 
 
8c7d08c
8e3d721
 
 
f24ad59
 
db70195
 
8e3d721
 
f24ad59
 
 
db70195
8e3d721
 
6419aeb
58b1ffb
6419aeb
f24ad59
8e3d721
6419aeb
8e3d721
58b1ffb
8e3d721
 
 
 
6419aeb
8e3d721
 
6419aeb
8e3d721
 
 
 
 
f24ad59
 
 
58b1ffb
f24ad59
 
8e3d721
 
 
 
8c7d08c
 
3b780fb
 
 
 
 
 
f24ad59
3b780fb
 
 
8c7d08c
8e3d721
6419aeb
8e3d721
 
 
6419aeb
f24ad59
 
8919651
f24ad59
 
8e3d721
6419aeb
8e3d721
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

import {
  ClapProject,
  ClapSegment,
  getClapAssetSourceType,
  filterSegments,
  ClapSegmentFilteringMode,
  ClapSegmentCategory
} from "@aitube/clap"
import { ClapCompletionMode } from "@aitube/client"
import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"

import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
import { getMediaInfo } from "@/app/api/utils/getMediaInfo"

/**
 * Generates the dialogue audio for a single shot, in place.
 *
 * Looks up the first DIALOGUE segment overlapping `shotSegment`; if it has no
 * `assetUrl` yet, synthesizes speech for its prompt with Parler-TTS, stores the
 * resulting asset on the segment, and propagates the measured audio duration to
 * the shot's segments.
 *
 * @param shotSegment  the shot whose dialogue should be generated
 * @param existingClap the full project the shot belongs to (mutated in place)
 * @param newerClap    accumulator clap for partial-mode responses (mutated in place)
 * @param mode         when not FULL, the updated segment is also pushed into `newerClap`
 * @param turbo        currently unused here; kept for interface parity with other processors
 * @throws rethrows any error raised by speech generation or media probing
 */
export async function processShot({
  shotSegment,
  existingClap,
  newerClap,
  mode,
  turbo,
}: {
  shotSegment: ClapSegment
  existingClap: ClapProject
  newerClap: ClapProject
  mode: ClapCompletionMode
  turbo: boolean
}): Promise<void> {

  // every segment whose time range overlaps this shot
  const shotSegments: ClapSegment[] = filterSegments(
    ClapSegmentFilteringMode.BOTH,
    shotSegment,
    existingClap.segments
  )

  const shotDialogueSegments: ClapSegment[] = shotSegments.filter(s =>
    s.category === ClapSegmentCategory.DIALOGUE
  )

  // only the first dialogue segment of the shot is processed
  const shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)

  console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)

  if (shotDialogueSegment && !shotDialogueSegment.assetUrl) {
    // console.log(`[api/edit/dialogues] generating audio..`)

    try {
      // this generates a mp3
      shotDialogueSegment.assetUrl = await generateSpeechWithParlerTTS({
        text: shotDialogueSegment.prompt,
        audioId: getSpeechBackgroundAudioPrompt(
          shotSegments,
          existingClap.entityIndex,
          // TODO: use the entity description if it exists
          ["high quality", "crisp", "detailed"]
        ),
        debug: true,
      })
      shotDialogueSegment.assetSourceType = getClapAssetSourceType(shotDialogueSegment.assetUrl)

      const { durationInMs, hasAudio } = await getMediaInfo(shotDialogueSegment.assetUrl)

      // ignore implausibly short clips (<= 1s), which usually indicate a failed generation
      if (hasAudio && durationInMs > 1000) {
        shotDialogueSegment.assetDurationInMs = durationInMs
        shotSegment.assetDurationInMs = durationInMs

        // we update the duration of all the segments for this shot
        // (fix: the previous code iterated existingClap.segments, which
        // clobbered the duration of every segment in the whole project)
        shotSegments.forEach(s => {
          s.assetDurationInMs = durationInMs
        })
      }

    } catch (err) {
      console.log(`[api/edit/dialogues] processShot: failed to generate audio: ${err}`)
      throw err
    }

    console.log(`[api/edit/dialogues] processShot: generated dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)

    // if it's partial, we need to manually add it
    if (mode !== ClapCompletionMode.FULL) {
      newerClap.segments.push(shotDialogueSegment)
    }
  } else {
    console.log(`[api/edit/dialogues] processShot: there is already a dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)
  }
}