File size: 4,084 Bytes
8919651
 
 
6d66622
8919651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4a1dc1
 
8919651
 
 
d4a1dc1
8919651
 
d4a1dc1
 
8919651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4a1dc1
8919651
 
58b1ffb
8919651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4a1dc1
8919651
 
58b1ffb
8919651
 
 
 
 
 
 
 
6d66622
 
 
 
 
 
 
 
8919651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4a1dc1
 
8919651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58b1ffb
8919651
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"use server"

import YAML from "yaml"
import { ClapSegmentCategory, generateSeed } from "@aitube/clap"
import { ClapEntityPrompt } from "@aitube/client"

import { sleep } from "@/lib/utils/sleep"
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
import { LatentEntity, LatentStory } from "@/app/api/v1/types"

import { systemPrompt } from "./systemPrompt"
import { generateImageID } from "./generateImageID"

// one generated entity, as returned by generateEntityPrompts()
export type EntityPromptResult = {
  // the entity description (name, category, variant, identity image..)
  entityPrompt: ClapEntityPrompt

  // the shot numbers reported by the LLM for this entity
  // (presumably the numeric `shot` IDs injected into the timeline prompt — TODO confirm)
  shots: number[]
}

/**
 * Ask the LLM to extract entities from a story, then build a
 * `ClapEntityPrompt` (with a generated identity image) for each one.
 *
 * The story timeline is serialized to YAML, with each shot tagged by its
 * index, and sent to the Hugging Face predictor. If the answer cannot be
 * parsed into a non-empty array of entities, the request is retried once
 * after a short pause.
 *
 * @param prompt - short description of what the story is about (required)
 * @param latentStory - the story timeline, one item per shot (required)
 * @param turbo - forwarded as-is to `predict()` and `generateImageID()`
 * @returns one result per kept entity: its prompt and the shots it was given
 * @throws Error if `prompt` or `latentStory` is empty, or if no entities
 *         could be parsed after the retry
 */
export async function generateEntityPrompts({
  prompt = "",
  latentStory = [],
  turbo = false,
}: {
  prompt?: string
  latentStory?: LatentStory[]
  turbo?: boolean
} = {
  prompt: "",
  latentStory: [],
  turbo: false
}): Promise<EntityPromptResult[]> {

  if (!prompt.length) { throw new Error(`please provide a prompt`) }
  console.log("generateEntityPrompts(): prompt:", prompt)

  if (!latentStory.length) { throw new Error(`please provide a story`) }
  console.log("generateEntityPrompts(): latentStory:", latentStory)

  const userPrompt = `The input story is about: ${prompt}.

The input story timeline is:
\`\`\`yaml
${YAML.stringify(
  // we need to help the LLM by marking the shots with a simple numeric ID
  latentStory.map((shot, i) => ({
    shot: i,
    ...shot,
  }))
)}
\`\`\`

Now please generate the output entities:`

  const prefix = "```yaml\n"
  const nbMaxNewTokens = 1400

  // the parsed LLM output is untrusted: it is only usable if it is
  // a non-empty array (a scalar or null must be treated as a failure)
  const hasEntities = (value: unknown): boolean =>
    Array.isArray(value) && value.length > 0

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents
  // so technically we could stream everything from end-to-end
  // (but I haven't coded the helpers to do this yet)
  const firstAnswer = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens,
    prefix,
    turbo,
  })

  let maybeEntities = parseRawStringToYAML<LatentEntity[]>(firstAnswer, [])

  if (!hasEntities(maybeEntities)) {
    console.log(`generateEntityPrompts(): failed to generate entities.. trying again`)

    await sleep(2000)

    const secondAnswer = await predict({
      systemPrompt,
      userPrompt: userPrompt + ".", // we trick the Hugging Face cache
      nbMaxNewTokens,
      prefix,
      turbo,
    })

    maybeEntities = parseRawStringToYAML<LatentEntity[]>(secondAnswer, [])

    if (!hasEntities(maybeEntities)) {
      console.log(`generateEntityPrompts(): failed to generate entities for the second time, which indicates an issue with the Hugging Face API`)
    }
  }

  // same guard as above: a non-array value must end up here,
  // not crash later on `.filter()`
  if (!hasEntities(maybeEntities)) {
    throw new Error(`Hugging Face Inference API failure (the model failed to generate the entities)`)
  }

  const results: EntityPromptResult[] = await Promise.all(
    maybeEntities

      // the LLM generates unrelated categories unfortunately,
      // that we still turn into image.. so we fix that by filtering
      //
      // NOTE(review): as written this DROPS the CHARACTER entities and keeps
      // all the others, which looks like the opposite of what the comment
      // above describes. Left unchanged on purpose: confirm the intent (and
      // the casing of the categories returned by the LLM) before flipping it
      .filter(({ category }) => category !== ClapSegmentCategory.CHARACTER)

      // the identity images are generated in parallel, one per entity
      .map(async ({ name, category, image, shots }): Promise<EntityPromptResult> => {

        const entityPrompt: ClapEntityPrompt = {
          name,
          category,
          age: "",
          variant: image,
          region: "",
          identityImage: await generateImageID({
            prompt: image,
            seed: generateSeed(),
            turbo,
          }),

          // TODO later
          identityVoice: "" // await generateAudioID({ prompt: e.audio, seed: generateSeed() })
        }

        return { entityPrompt, shots }
      })
  )

  // console.log(`generateEntityPrompts(): generated ${results.length} entities with their images and voice ids`)

  return results
}