// ai-lab-tube
//
// Sleeping
//
// File size: 5,984 Bytes


import { ClapProject, ClapSegmentCategory, getClapAssetSourceType, getValidNumber, newEntity } from "@aitube/clap"
import { ClapCompletionMode, ClapEntityPrompt } from "@aitube/client"

import { generateImageID } from "./generateImageID"
import { generateAudioID } from "./generateAudioID"
import { generateEntityPrompts } from "./generateEntityPrompts"
import { clapToLatentStory } from "./clapToLatentStory"

/**
 * Creates, repairs and completes the entities of a clap project.
 *
 * Steps, in order:
 * 1. If `existingClap` has no entities and no `entityPrompts` were given,
 *    entity prompts are generated from the project description and latent
 *    story, turned into entities, and assigned to camera shots.
 * 2. Every item of `entityPrompts` is turned into a new entity.
 * 3. Each entity of `existingClap` gets its missing image prompt, image id,
 *    audio prompt and audio id filled in.
 *
 * `existingClap` is mutated in place (entities are pushed, shot segments get
 * an `entityId`, entity fields are filled in).
 *
 * @param existingClap - the project to read from and complete
 * @param newerClap - the project that receives modified entities when the mode is not FULL
 * @param entityPrompts - optional user-provided descriptions of entities to add
 * @param mode - FULL or anything else (treated as PARTIAL)
 * @param turbo - forwarded to the prompt and image generators
 * @returns `newerClap`
 * @throws Error when there is still no entity after steps 1 and 2
 */
export async function editEntities({
  existingClap,
  newerClap,
  entityPrompts = [],
  mode = ClapCompletionMode.PARTIAL,
  turbo = false,
}: {
  existingClap: ClapProject
  newerClap: ClapProject
  entityPrompts?: ClapEntityPrompt[]
  mode?: ClapCompletionMode
  turbo?: boolean
}) {

  // note that we can only handle either FULL or PARTIAL
  // other modes such as MERGE, REPLACE.. are irrelevant since those are client-side modes
  // so from a server point of view those correspond to PARTIAL
  //
  // it is also worth noting that the use of FULL should be discouraged
  const isPartial = mode !== ClapCompletionMode.FULL

  // if we don't have existing entities, and user passed none,
  // then we need to hallucinate them
  if (existingClap.entities.length === 0 && entityPrompts.length === 0) {

    const entityPromptsWithShots = await generateEntityPrompts({
      prompt: existingClap.meta.description,
      latentStory: await clapToLatentStory(existingClap),
      turbo,
    })

    // shot ids returned by the generator are used as indices into this list
    const allShots = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)

    for (const { entityPrompt, shots: entityShots } of entityPromptsWithShots) {
      const newEnt = createEntityFromPrompt(entityPrompt)

      // NOTE(review): only `entities` is updated here, `existingClap.entityIndex`
      // is left untouched — confirm that callers do not rely on it
      existingClap.entities.push(newEnt)

      // now let's assign our entity to shots!
      //
      // warning: the shot assignment is the responsibility of the LLM.
      // if the LLM hallucinates non-existing shot ids, it will cause trouble!
      for (const shotId of entityShots) {
        if (allShots[shotId]) {
          allShots[shotId].entityId = newEnt.id
        } else {
          console.log(`[api/v1/edit/entities] warning: the LLM generated a non-existing shot (shot "${shotId}", but we only have ${allShots.length} shots)`)
        }
      }
    }
  }

  // otherwise try to add what's new
  for (const entityPrompt of entityPrompts) {
    existingClap.entities.push(createEntityFromPrompt(entityPrompt))
  }

  if (!existingClap.entities.length) { throw new Error(`please provide at least one entity`) }

  // then we try to automatically repair, edit, complete.. all the existing entities

  for (const entity of existingClap.entities) {

    let entityHasBeenModified = false

    // TASK 1: GENERATE THE IMAGE PROMPT IF MISSING
    // (currently a hard-coded placeholder, no generation happens)
    if (!entity.imagePrompt) {
      entity.imagePrompt = "a man with a beard"
      entityHasBeenModified = true
    }

    // TASK 2: GENERATE THE IMAGE ID IF MISSING
    if (!entity.imageId) {
      entity.imageId = await generateImageID({
        prompt: entity.imagePrompt,
        seed: entity.seed,
        turbo,
      })
      entity.imageSourceType = getClapAssetSourceType(entity.imageId)
      entityHasBeenModified = true
    }

    // TASK 3: GENERATE THE AUDIO PROMPT IF MISSING
    // (currently a hard-coded placeholder, no generation happens)
    if (!entity.audioPrompt) {
      entity.audioPrompt = "a man with a beard"
      entityHasBeenModified = true
    }

    // TASK 4: GENERATE THE AUDIO ID IF MISSING

    // TODO here: call Parler-TTS or a generic audio generator
    if (!entity.audioId) {
      entity.audioId = await generateAudioID({
        prompt: entity.audioPrompt,
        seed: entity.seed
      })
      entity.audioSourceType = getClapAssetSourceType(entity.audioId)
      entityHasBeenModified = true
    }

    // in case we are doing a partial update: report each modified entity
    // once, skipping those already present in the newer clap's index
    if (isPartial && entityHasBeenModified && !newerClap.entityIndex[entity.id]) {
      newerClap.entities.push(entity)
      newerClap.entityIndex[entity.id] = entity
    }
  }

  // console.log(`api/edit/entities(): returning the newerClap`)

  return newerClap
}

/**
 * Builds a new entity from an entity prompt.
 *
 * The image and audio prompts are left empty, so `editEntities` fills them
 * in afterwards.
 */
function createEntityFromPrompt({
  name,
  category,
  age,
  variant,
  region,
  identityImage,
  identityVoice,
}: ClapEntityPrompt) {
  return newEntity({
    category,
    triggerName: name,
    label: name,
    description: name,
    author: "auto",
    thumbnailUrl: "",

    imagePrompt: "",
    imageSourceType: getClapAssetSourceType(identityImage),
    imageEngine: "SD Lightning",
    imageId: identityImage,
    audioPrompt: "",
    audioSourceType: getClapAssetSourceType(identityVoice),
    audioEngine: "Parler-TTS", // <- TODO: use OpenVoice 2, that way it can be personalized
    audioId: identityVoice,

    // note: using a numeric age should be deprecated,
    // instead we should be able to specify things using text,
    // eg. "8 months", "25 years old", "12th century"
    age: getValidNumber(age, 0, 120, 25),

    // TODO: delete gender and appearance, replace by a single concept of "variant"
    gender: "",
    appearance: variant,
    region: region,
  })
}