// Page header captured with this file:
// author: jbilcke-hf (HF Staff) · commit message: "improve prompts" · revision: 58b1ffb
import { ClapProject, ClapSegmentCategory, getClapAssetSourceType, getValidNumber, newEntity } from "@aitube/clap"
import { ClapCompletionMode, ClapEntityPrompt } from "@aitube/client"
import { generateImageID } from "./generateImageID"
import { generateAudioID } from "./generateAudioID"
import { generateEntityPrompts } from "./generateEntityPrompts"
import { clapToLatentStory } from "./clapToLatentStory"
/**
 * Builds a fresh entity from the fields of an entity prompt.
 *
 * Shared by both entity-creation paths in `editEntities` (LLM-generated
 * prompts and caller-supplied prompts) so they always produce entities
 * with the same defaults.
 *
 * @param prompt - the subset of entity prompt fields used to seed the entity
 * @returns the entity created by `newEntity`
 */
function createEntityFromPrompt({
  name,
  category,
  age,
  variant,
  region,
  identityImage,
  identityVoice,
}: Pick<
  ClapEntityPrompt,
  "name" | "category" | "age" | "variant" | "region" | "identityImage" | "identityVoice"
>) {
  return newEntity({
    category,
    triggerName: name,
    label: name,
    description: name,
    author: "auto",
    thumbnailUrl: "",
    imagePrompt: "",
    imageSourceType: getClapAssetSourceType(identityImage),
    imageEngine: "SD Lightning",
    imageId: identityImage,
    audioPrompt: "",
    audioSourceType: getClapAssetSourceType(identityVoice),
    audioEngine: "Parler-TTS", // <- TODO: use OpenVoice 2, that way it can be personalized
    audioId: identityVoice,

    // note: using a numeric age should be deprecated,
    // instead we should be able to specify things using text,
    // eg. "8 months", "25 years old", "12th century"
    age: getValidNumber(age, 0, 120, 25),

    // TODO: delete gender and appearance, replace by a single concept of "variant"
    gender: "",
    appearance: variant,
    region: region,
  })
}

/**
 * Creates, repairs and completes the entities of a clap project.
 *
 * Steps, in order:
 * 1. If the project has no entities and the caller passed no prompts,
 *    entity prompts are generated from the project description and story,
 *    turned into entities, and assigned to the camera shots they reference.
 * 2. Every caller-supplied entity prompt is turned into a new entity.
 * 3. Every entity gets its missing image prompt, image id, audio prompt
 *    and audio id filled in.
 *
 * `existingClap` is mutated in place (entities are pushed, camera segments get
 * an `entityId`, missing entity fields are filled).
 * NOTE(review): new entities are pushed to `existingClap.entities` only;
 * `existingClap.entityIndex` is not updated here — confirm callers don't rely on it.
 *
 * @param existingClap - the project to read from and complete
 * @param newerClap - the project that receives modified entities in partial mode
 * @param entityPrompts - optional entity prompts to add as new entities
 * @param mode - completion mode; anything other than FULL is treated as PARTIAL
 * @param turbo - forwarded to the entity prompt and image generators
 * @returns `newerClap`; in FULL mode this function does not add anything to it
 * @throws Error when there is still no entity after steps 1 and 2
 */
export async function editEntities({
  existingClap,
  newerClap,
  entityPrompts = [],
  mode = ClapCompletionMode.PARTIAL,
  turbo = false,
}: {
  existingClap: ClapProject
  newerClap: ClapProject
  entityPrompts?: ClapEntityPrompt[]
  mode?: ClapCompletionMode
  turbo?: boolean
}) {
  // note that we can only handle either FULL or PARTIAL
  // other modes such as MERGE, REPLACE.. are irrelevant since those are client-side modes
  // so from a server point of view those correspond to PARTIAL
  //
  // it is also worth noting that the use of FULL should be discouraged
  const isPartial = mode !== ClapCompletionMode.FULL

  // if we don't have existing entities, and user passed none,
  // then we need to hallucinate them
  if (existingClap.entities.length === 0 && entityPrompts.length === 0) {
    const entityPromptsWithShots = await generateEntityPrompts({
      prompt: existingClap.meta.description,
      latentStory: await clapToLatentStory(existingClap),
      turbo,
    })

    // shot ids returned by the LLM are used as positions in this list of camera segments
    const allShots = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)

    for (const { entityPrompt, shots: entityShots } of entityPromptsWithShots) {
      const newEnt = createEntityFromPrompt(entityPrompt)
      existingClap.entities.push(newEnt)

      // now let's assign our entity to shots!
      //
      // warning: the shot assignment is the responsibility of the LLM.
      // if the LLM hallucinates non-existing shot ids, it will cause trouble!
      for (const shotId of entityShots) {
        if (allShots[shotId]) {
          allShots[shotId].entityId = newEnt.id
        } else {
          console.log(`[api/v1/edit/entities] warning: the LLM generated a non-existing shot (shot "${shotId}", but we only have ${allShots.length} shots)`)
        }
      }
    }
  }

  // otherwise try to add what's new
  for (const entityPrompt of entityPrompts) {
    existingClap.entities.push(createEntityFromPrompt(entityPrompt))
  }

  if (!existingClap.entities.length) { throw new Error(`please provide at least one entity`) }

  // then we try to automatically repair, edit, complete.. all the existing entities
  for (const entity of existingClap.entities) {
    let entityHasBeenModified = false

    // TASK 1: GENERATE THE IMAGE PROMPT IF MISSING
    // (currently a fixed placeholder prompt)
    if (!entity.imagePrompt) {
      entity.imagePrompt = "a man with a beard"
      entityHasBeenModified = true
    }

    // TASK 2: GENERATE THE IMAGE ID IF MISSING
    if (!entity.imageId) {
      entity.imageId = await generateImageID({
        prompt: entity.imagePrompt,
        seed: entity.seed,
        turbo,
      })
      entity.imageSourceType = getClapAssetSourceType(entity.imageId)
      entityHasBeenModified = true
    }

    // TASK 3: GENERATE THE AUDIO PROMPT IF MISSING
    // (currently a fixed placeholder prompt)
    if (!entity.audioPrompt) {
      entity.audioPrompt = "a man with a beard"
      entityHasBeenModified = true
    }

    // TASK 4: GENERATE THE AUDIO ID IF MISSING
    // TODO here: call Parler-TTS or a generic audio generator
    if (!entity.audioId) {
      entity.audioId = await generateAudioID({
        prompt: entity.audioPrompt,
        seed: entity.seed
      })
      entity.audioSourceType = getClapAssetSourceType(entity.audioId)
      entityHasBeenModified = true
    }

    // in case we are doing a partial update:
    // only entities touched above, and not already indexed in newerClap, are reported
    if (isPartial && entityHasBeenModified && !newerClap.entityIndex[entity.id]) {
      newerClap.entities.push(entity)
      newerClap.entityIndex[entity.id] = entity
    }
  }

  // console.log(`api/edit/entities(): returning the newerClap`)

  return newerClap
}