Spaces:
Sleeping
Sleeping
File size: 5,523 Bytes
f24ad59 58b1ffb f24ad59 8919651 f24ad59 8919651 f24ad59 d2f7c95 f24ad59 d2f7c95 f24ad59 d2f7c95 8919651 f24ad59 5513dc6 f24ad59 5513dc6 f24ad59 5513dc6 d2f7c95 f24ad59 58b1ffb f24ad59 5513dc6 d2f7c95 5513dc6 58b1ffb 5513dc6 8919651 5513dc6 8919651 5513dc6 8919651 f24ad59 58b1ffb f24ad59 8919651 58b1ffb f24ad59 58b1ffb f24ad59 58b1ffb f24ad59 58b1ffb f24ad59 58b1ffb f24ad59 90266e1 f24ad59 e864e26 f24ad59 e864e26 f24ad59 90266e1 f24ad59 d4a1dc1 58b1ffb d4a1dc1 f24ad59 90266e1 f24ad59 d4a1dc1 f24ad59 d4a1dc1 f24ad59 90266e1 f24ad59 d4a1dc1 8919651 d4a1dc1 f24ad59 90266e1 f24ad59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
"use server"
import { ClapProject, getValidNumber, newClap, newSegment, ClapSegmentCategory, ClapOutputType, ClapMediaOrientation } from "@aitube/clap"
import { sleep } from "@/lib/utils/sleep"
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
import { LatentStory } from "@/app/api/v1/types"
import { systemPrompt } from "./systemPrompt"
/**
 * Generate a Clap story project from a few sentences.
 *
 * This helper is mostly used by external apps such as the Stories Factory.
 *
 * The prompt is sent to the Hugging Face prediction helper together with the
 * system prompt; the raw answer is parsed as a YAML list of shots, and each
 * shot is expanded into five segments (video, storyboard, interface text,
 * dialogue and camera) laid out back-to-back on tracks 0 to 4.
 *
 * @param request - Generation settings.
 *   - `prompt`: the story description (mandatory in practice, see `@throws`).
 *   - `width` / `height`: passed through `getValidNumber` with bounds
 *     256..8192 and fallbacks of 1024 and 576 respectively.
 *   - `turbo`: forwarded as-is (coerced to a boolean) to the prediction call.
 * @returns A new Clap project whose segments describe the generated shots.
 * @throws Error if the trimmed prompt is empty.
 * @throws Error if no usable shot could be parsed after two prediction attempts.
 */
export async function create(request: {
  prompt?: string
  width?: number
  height?: number
  turbo?: boolean
}= {
  prompt: "",
  width: 1024,
  height: 576,
  turbo: false,
}): Promise<ClapProject> {
  const prompt = `${request?.prompt || ""}`.trim()

  console.log("api/v1/create(): request:", request)

  if (!prompt.length) { throw new Error(`please provide a prompt`) }

  const width = getValidNumber(request?.width, 256, 8192, 1024)
  const height = getValidNumber(request?.height, 256, 8192, 576)
  const turbo = Boolean(request?.turbo)

  // written with an explicit "\n" so that source indentation can never leak
  // into the text sent to the model
  const userPrompt = `Movie story to generate: ${prompt}\nOutput: `

  const prefix = "```yaml\n"
  const nbMaxNewTokens = 1400

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents
  // so technically we could stream everything from end-to-end
  // (but I haven't coded the helpers to do this yet)

  const maxAttempts = 2
  let shots: LatentStory[] = []

  for (let attempt = 1; attempt <= maxAttempts && shots.length === 0; attempt++) {
    if (attempt > 1) {
      console.log(`api/v1/create(): failed to generate shots.. trying again`)
      await sleep(2000)
    }

    const rawString = await predict({
      systemPrompt,
      // on a retry we append a "." to trick the Hugging Face cache
      userPrompt: attempt === 1 ? userPrompt : userPrompt + ".",
      nbMaxNewTokens,
      prefix,
      turbo,
    })
    // console.log("api/v1/create(): rawString: ", rawString)

    // The parsed value comes from model output, so it is treated as untrusted:
    // anything that is not an array counts as a failed attempt, and entries
    // that are not plain objects are dropped (a null entry would otherwise
    // make the destructuring in the segment loop below throw).
    const parsed: unknown = parseRawStringToYAML<LatentStory[]>(rawString, [])
    const candidates: unknown[] = Array.isArray(parsed) ? parsed : []
    shots = candidates.filter((shot): shot is LatentStory =>
      typeof shot === "object" && shot !== null && !Array.isArray(shot)
    )
  }

  if (shots.length === 0) {
    console.log(`api/v1/create(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
    throw new Error(`Hugging Face Inference API failure (the model failed to generate the shots)`)
  }

  console.log(`api/v1/create(): generated ${shots.length} shots`)

  // this is approximate - TTS generation will determine the final duration of each shot
  const defaultSegmentDurationInMs = 7000

  let currentElapsedTimeInMs = 0

  const clap: ClapProject = newClap({
    meta: {
      // the title is whatever precedes the first comma of the prompt
      title: prompt.split(",").shift() || "",
      description: prompt,
      synopsis: "",
      licence: "",
      orientation:
        width > height ? ClapMediaOrientation.LANDSCAPE :
        height > width ? ClapMediaOrientation.PORTRAIT :
        ClapMediaOrientation.SQUARE,
      width,
      height,
      isInteractive: false,
      isLoop: false,
      durationInMs: shots.length * defaultSegmentDurationInMs,
      defaultVideoModel: "AnimateDiff-Lightning",
    }
  })

  for (const { comment, image, voice } of shots) {
    console.log(`api/v1/create(): - ${comment}`)

    // every segment of a given shot shares the same time window
    const timing = {
      startTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
    }

    // note: it would be nice if we could have a convention saying that
    // track 0 is for videos and track 1 storyboards
    //
    // however, that's a bit constraining as people will generate .clap
    // using all kind of tools and development experience,
    // and they may not wish to learn the Clap protocol format completely
    //
    // TL;DR:
    // we should fix the Clap file editor to make it able to read videos
    // from any track number

    clap.segments.push(newSegment({
      track: 0,
      ...timing,
      category: ClapSegmentCategory.VIDEO,
      prompt: image,
      outputType: ClapOutputType.VIDEO,
    }))

    clap.segments.push(newSegment({
      track: 1,
      ...timing,
      category: ClapSegmentCategory.STORYBOARD,
      prompt: image,
      outputType: ClapOutputType.IMAGE,
    }))

    clap.segments.push(newSegment({
      track: 2,
      ...timing,
      category: ClapSegmentCategory.INTERFACE,
      prompt: comment,
      // assetUrl: `data:text/plain;base64,${btoa(comment)}`,
      assetUrl: comment,
      outputType: ClapOutputType.TEXT,
    }))

    clap.segments.push(newSegment({
      track: 3,
      ...timing,
      category: ClapSegmentCategory.DIALOGUE,
      prompt: voice,
      outputType: ClapOutputType.AUDIO,
    }))

    // the presence of a camera is mandatory
    clap.segments.push(newSegment({
      track: 4,
      ...timing,
      category: ClapSegmentCategory.CAMERA,
      prompt: "video",
      outputType: ClapOutputType.TEXT,
    }))

    currentElapsedTimeInMs += defaultSegmentDurationInMs
  }

  return clap
}
|