Spaces:
Sleeping
Sleeping
import { ClapProject, ClapSegmentCategory, getClapAssetSourceType, getValidNumber, newEntity } from "@aitube/clap" | |
import { ClapCompletionMode, ClapEntityPrompt } from "@aitube/client" | |
import { generateImageID } from "./generateImageID" | |
import { generateAudioID } from "./generateAudioID" | |
import { generateEntityPrompts } from "./generateEntityPrompts" | |
import { clapToLatentStory } from "./clapToLatentStory" | |
/**
 * The subset of an entity prompt that is read when building an entity.
 */
type EntityPromptFields = Pick<
  ClapEntityPrompt,
  "name" | "category" | "age" | "variant" | "region" | "identityImage" | "identityVoice"
>

/**
 * Builds a new entity from an entity prompt.
 *
 * The prompt's name is used as trigger name, label and description.
 * The image and audio prompts are left empty on purpose, they get
 * filled later by the completion pass in `editEntities`.
 */
function createEntityFromPrompt({
  name,
  category,
  age,
  variant,
  region,
  identityImage,
  identityVoice,
}: EntityPromptFields) {
  return newEntity({
    category,
    triggerName: name,
    label: name,
    description: name,
    author: "auto",
    thumbnailUrl: "",

    imagePrompt: "",
    imageSourceType: getClapAssetSourceType(identityImage),
    imageEngine: "SD Lightning",
    imageId: identityImage,

    audioPrompt: "",
    audioSourceType: getClapAssetSourceType(identityVoice),
    audioEngine: "Parler-TTS", // <- TODO: use OpenVoice 2, that way it can be personalized
    audioId: identityVoice,

    // note: using a numeric age should be deprecated,
    // instead we should be able to specify things using text,
    // eg. "8 months", "25 years old", "12th century"
    age: getValidNumber(age, 0, 120, 25),

    // TODO: delete gender and appearance, replace by a single concept of "variant"
    gender: "",
    appearance: variant,
    region,
  })
}

/**
 * Adds, repairs and completes the entities of a clap project.
 *
 * Steps, in order:
 * 1. if `existingClap` has no entities and no `entityPrompts` were passed,
 *    entity prompts are generated from the project description and story,
 *    turned into entities, and assigned to the camera segments they reference
 * 2. one entity is created for each item of `entityPrompts`
 * 3. every entity of `existingClap` gets its missing image prompt, image id,
 *    audio prompt and audio id filled in
 *
 * Side effects: `existingClap` is mutated in place (entities are appended and
 * completed, camera segments may receive an `entityId`). Unless the mode is FULL,
 * each entity modified during step 3 that is not yet in `newerClap.entityIndex`
 * is appended to `newerClap.entities` and registered in `newerClap.entityIndex`.
 *
 * @param existingClap - the project to read from, and to complete in place
 * @param newerClap - the project receiving the modified entities (non-FULL modes)
 * @param entityPrompts - optional descriptions of entities to create
 * @param mode - completion mode, anything other than FULL is handled as PARTIAL
 * @param turbo - forwarded to the entity prompt generator and the image generator
 * @returns the `newerClap` object that was passed in
 * @throws Error if there is still no entity after steps 1 and 2
 */
export async function editEntities({
  existingClap,
  newerClap,
  entityPrompts = [],
  mode = ClapCompletionMode.PARTIAL,
  turbo = false,
}: {
  existingClap: ClapProject
  newerClap: ClapProject
  entityPrompts?: ClapEntityPrompt[]
  mode?: ClapCompletionMode
  turbo?: boolean
}): Promise<ClapProject> {

  // note that we can only handle either FULL or PARTIAL
  // other modes such as MERGE, REPLACE.. are irrelevant since those are client-side modes
  // so from a server point of view those correspond to PARTIAL
  //
  // it is also worth noting that the use of FULL should be discouraged
  const isFull = mode === ClapCompletionMode.FULL

  // if we don't have existing entities, and user passed none,
  // then we need to hallucinate them
  if (existingClap.entities.length === 0 && entityPrompts.length === 0) {
    const entityPromptsWithShots = await generateEntityPrompts({
      prompt: existingClap.meta.description,
      latentStory: await clapToLatentStory(existingClap),
      turbo,
    })

    const allShots = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)

    for (const { entityPrompt, shots: entityShots } of entityPromptsWithShots) {
      const newEnt = createEntityFromPrompt(entityPrompt)

      // NOTE(review): existingClap.entityIndex is not updated here, only the
      // entities array is. Confirm that callers do not rely on the index.
      existingClap.entities.push(newEnt)

      // now let's assign our entity to shots!
      //
      // warning: the shot assignment is the responsibility of the LLM.
      // if the LLM hallucinates non-existing shot ids, it will cause trouble!
      for (const shotId of entityShots) {
        // shot ids are used as positions within the list of camera segments
        const shot = allShots[shotId]
        if (shot) {
          shot.entityId = newEnt.id
        } else {
          console.log(`[api/v1/edit/entities] warning: the LLM generated a non-existing shot (shot "${shotId}", but we only have ${allShots.length} shots)`)
        }
      }
    }
  }

  // otherwise try to add what's new
  // (this loop does nothing when we took the branch above, since entityPrompts is empty)
  for (const entityPrompt of entityPrompts) {
    existingClap.entities.push(createEntityFromPrompt(entityPrompt))
  }

  if (!existingClap.entities.length) { throw new Error(`please provide at least one entity`) }

  // then we try to automatically repair, edit, complete.. all the existing entities
  for (const entity of existingClap.entities) {

    let entityHasBeenModified = false

    // TASK 1: GENERATE THE IMAGE PROMPT IF MISSING
    // NOTE(review): this is a hard-coded placeholder, it ignores the entity's own fields
    if (!entity.imagePrompt) {
      entity.imagePrompt = "a man with a beard"
      entityHasBeenModified = true
    }

    // TASK 2: GENERATE THE IMAGE ID IF MISSING
    if (!entity.imageId) {
      entity.imageId = await generateImageID({
        prompt: entity.imagePrompt,
        seed: entity.seed,
        turbo,
      })
      entity.imageSourceType = getClapAssetSourceType(entity.imageId)
      entityHasBeenModified = true
    }

    // TASK 3: GENERATE THE AUDIO PROMPT IF MISSING
    // NOTE(review): same hard-coded placeholder as the image prompt
    if (!entity.audioPrompt) {
      entity.audioPrompt = "a man with a beard"
      entityHasBeenModified = true
    }

    // TASK 4: GENERATE THE AUDIO ID IF MISSING
    // TODO here: call Parler-TTS or a generic audio generator
    if (!entity.audioId) {
      entity.audioId = await generateAudioID({
        prompt: entity.audioPrompt,
        seed: entity.seed
      })
      entity.audioSourceType = getClapAssetSourceType(entity.audioId)
      entityHasBeenModified = true
    }

    // in case we are doing a partial update:
    // report the modified entity in the newer clap, unless it is already indexed there
    if (!isFull && entityHasBeenModified && !newerClap.entityIndex[entity.id]) {
      newerClap.entities.push(entity)
      newerClap.entityIndex[entity.id] = entity
    }
  }

  // console.log(`api/edit/entities(): returning the newerClap`)

  return newerClap
}