Spaces:
Paused
Paused
Julian Bilcke
commited on
Commit
·
a65e95e
1
Parent(s):
29598d1
work in progress to create the video service
Browse files- Dockerfile +1 -1
- package-lock.json +9 -0
- package.json +2 -0
- src/data/all_words.json +0 -0
- src/data/good_words.json +0 -0
- src/index.mts +47 -61
- src/services/addAudioToVideo.mts +31 -15
- src/services/generateActor.mts +50 -0
- src/services/generateAudio.mts +45 -22
- src/services/generateAudioLegacy.mts +33 -0
- src/services/generateShot.mts +209 -0
- src/services/generateVideo.mts +2 -1
- src/services/generateVoice.mts +56 -0
- src/services/interpolateVideo.mts +35 -22
- src/services/interpolateVideoLegacy.mts +39 -0
- src/services/mergeAudio.mts +49 -0
- src/services/postInterpolation.mts +57 -0
- src/test2.mts +7 -0
- src/types.mts +23 -2
Dockerfile
CHANGED
|
@@ -30,6 +30,6 @@ RUN npm install
|
|
| 30 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 31 |
COPY --chown=user . $HOME/app
|
| 32 |
|
| 33 |
-
EXPOSE 7860
|
| 34 |
|
| 35 |
CMD [ "npm", "run", "start" ]
|
|
|
|
| 30 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 31 |
COPY --chown=user . $HOME/app
|
| 32 |
|
| 33 |
+
EXPOSE 7860
|
| 34 |
|
| 35 |
CMD [ "npm", "run", "start" ]
|
package-lock.json
CHANGED
|
@@ -10,6 +10,7 @@
|
|
| 10 |
"license": "Apache License",
|
| 11 |
"dependencies": {
|
| 12 |
"@gradio/client": "^0.1.4",
|
|
|
|
| 13 |
"@types/express": "^4.17.17",
|
| 14 |
"@types/uuid": "^9.0.2",
|
| 15 |
"express": "^4.18.2",
|
|
@@ -78,6 +79,14 @@
|
|
| 78 |
"node": ">=18.0.0"
|
| 79 |
}
|
| 80 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
"node_modules/@jridgewell/resolve-uri": {
|
| 82 |
"version": "3.1.1",
|
| 83 |
"resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.1.tgz",
|
|
|
|
| 10 |
"license": "Apache License",
|
| 11 |
"dependencies": {
|
| 12 |
"@gradio/client": "^0.1.4",
|
| 13 |
+
"@huggingface/inference": "^2.6.1",
|
| 14 |
"@types/express": "^4.17.17",
|
| 15 |
"@types/uuid": "^9.0.2",
|
| 16 |
"express": "^4.18.2",
|
|
|
|
| 79 |
"node": ">=18.0.0"
|
| 80 |
}
|
| 81 |
},
|
| 82 |
+
"node_modules/@huggingface/inference": {
|
| 83 |
+
"version": "2.6.1",
|
| 84 |
+
"resolved": "https://registry.npmjs.org/@huggingface/inference/-/inference-2.6.1.tgz",
|
| 85 |
+
"integrity": "sha512-qFYchgOCPeEkZJKiSr7Kz62QwukJtgkeQCT7Q0SSKUcvHpTQVNJp6i/JrJMR4dBdzQysJ1SZDC0pLBBnnskTag==",
|
| 86 |
+
"engines": {
|
| 87 |
+
"node": ">=18"
|
| 88 |
+
}
|
| 89 |
+
},
|
| 90 |
"node_modules/@jridgewell/resolve-uri": {
|
| 91 |
"version": "3.1.1",
|
| 92 |
"resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.1.tgz",
|
package.json
CHANGED
|
@@ -6,6 +6,7 @@
|
|
| 6 |
"scripts": {
|
| 7 |
"start": "node --loader ts-node/esm src/index.mts",
|
| 8 |
"test": "node --loader ts-node/esm src/test.mts",
|
|
|
|
| 9 |
"docker": "npm run docker:build && npm run docker:run",
|
| 10 |
"docker:build": "docker build -t ai-webtv .",
|
| 11 |
"docker:run": "docker run -it -p 7860:7860 video-service"
|
|
@@ -14,6 +15,7 @@
|
|
| 14 |
"license": "Apache License",
|
| 15 |
"dependencies": {
|
| 16 |
"@gradio/client": "^0.1.4",
|
|
|
|
| 17 |
"@types/express": "^4.17.17",
|
| 18 |
"@types/uuid": "^9.0.2",
|
| 19 |
"express": "^4.18.2",
|
|
|
|
| 6 |
"scripts": {
|
| 7 |
"start": "node --loader ts-node/esm src/index.mts",
|
| 8 |
"test": "node --loader ts-node/esm src/test.mts",
|
| 9 |
+
"test2": "node --loader ts-node/esm src/test2.mts",
|
| 10 |
"docker": "npm run docker:build && npm run docker:run",
|
| 11 |
"docker:build": "docker build -t ai-webtv .",
|
| 12 |
"docker:run": "docker run -it -p 7860:7860 video-service"
|
|
|
|
| 15 |
"license": "Apache License",
|
| 16 |
"dependencies": {
|
| 17 |
"@gradio/client": "^0.1.4",
|
| 18 |
+
"@huggingface/inference": "^2.6.1",
|
| 19 |
"@types/express": "^4.17.17",
|
| 20 |
"@types/uuid": "^9.0.2",
|
| 21 |
"express": "^4.18.2",
|
src/data/all_words.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/data/good_words.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/index.mts
CHANGED
|
@@ -1,63 +1,62 @@
|
|
| 1 |
-
import { promises as fs } from
|
| 2 |
-
import path from 'node:path'
|
| 3 |
|
| 4 |
-
import
|
| 5 |
-
import express from 'express'
|
| 6 |
|
| 7 |
-
import {
|
| 8 |
-
import {
|
| 9 |
-
import {
|
| 10 |
-
import { generateSeed } from './services/generateSeed.mts'
|
| 11 |
-
import { addAudioToVideo } from './services/addAudioToVideo.mts'
|
| 12 |
-
|
| 13 |
-
import { MakeShot } from './types.mts'
|
| 14 |
|
| 15 |
const app = express()
|
| 16 |
const port = 7860
|
| 17 |
|
| 18 |
app.use(express.json())
|
| 19 |
|
|
|
|
| 20 |
|
| 21 |
-
app.post(
|
| 22 |
-
const query = req.body as
|
| 23 |
|
| 24 |
-
|
| 25 |
-
const token = `${query.token || ''}`
|
| 26 |
if (token !== process.env.VS_SECRET_ACCESS_TOKEN) {
|
| 27 |
console.log("couldn't find access token in the query")
|
| 28 |
-
res.write(JSON.stringify({ error: true, message:
|
| 29 |
res.end()
|
| 30 |
return
|
| 31 |
}
|
| 32 |
|
| 33 |
-
const shotPrompt = `${query.shotPrompt ||
|
| 34 |
if (shotPrompt.length < 5) {
|
| 35 |
-
res.write(JSON.stringify({ error: true, message:
|
| 36 |
res.end()
|
| 37 |
return
|
| 38 |
}
|
| 39 |
|
| 40 |
// optional video URL
|
| 41 |
-
// const inputVideo = `${req.query.inputVideo ||
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
// optional audio prompt
|
| 44 |
-
const
|
| 45 |
|
| 46 |
// optional seed
|
| 47 |
const defaultSeed = generateSeed()
|
| 48 |
const seedStr = Number(`${query.seed || defaultSeed}`)
|
| 49 |
const maybeSeed = Number(seedStr)
|
| 50 |
const seed = isNaN(maybeSeed) || ! isFinite(maybeSeed) ? defaultSeed : maybeSeed
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
// should we upscale or not?
|
| 54 |
-
const upscale = `${query.upscale || 'true'}` === 'true'
|
| 55 |
|
| 56 |
-
// duration of the prompt, in seconds
|
| 57 |
const defaultDuration = 3
|
|
|
|
| 58 |
const durationStr = Number(`${query.duration || defaultDuration}`)
|
| 59 |
const maybeDuration = Number(durationStr)
|
| 60 |
-
const duration = Math.min(
|
| 61 |
|
| 62 |
const defaultSteps = 35
|
| 63 |
const stepsStr = Number(`${query.steps || defaultSteps}`)
|
|
@@ -68,58 +67,45 @@ app.post('/shot', async (req, res) => {
|
|
| 68 |
const defaultFps = 24
|
| 69 |
const fpsStr = Number(`${query.fps || defaultFps}`)
|
| 70 |
const maybeFps = Number(fpsStr)
|
| 71 |
-
const
|
| 72 |
|
| 73 |
const defaultResolution = 576
|
| 74 |
const resolutionStr = Number(`${query.resolution || defaultResolution}`)
|
| 75 |
const maybeResolution = Number(resolutionStr)
|
| 76 |
const resolution = Math.min(1080, Math.max(256, isNaN(maybeResolution) || !isFinite(maybeResolution) ? defaultResolution : maybeResolution))
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
const shotFileName = `${Date.now()}.mp4`
|
| 80 |
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
shotPrompt,
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
duration,
|
|
|
|
|
|
|
| 86 |
nbSteps,
|
| 87 |
-
fps,
|
| 88 |
-
seed,
|
| 89 |
upscale,
|
| 90 |
-
|
|
|
|
| 91 |
})
|
| 92 |
-
console.log('generating base video ..')
|
| 93 |
-
const generatedVideoUrl = await generateVideo(shotPrompt, {
|
| 94 |
-
seed,
|
| 95 |
-
nbFrames: 24, // if we try more eg 48 frames, this will crash the upscaler (not enough memory)
|
| 96 |
-
nbSteps
|
| 97 |
-
})
|
| 98 |
-
|
| 99 |
|
| 100 |
-
console.log(
|
| 101 |
-
const videoFileName = await downloadVideo(generatedVideoUrl, shotFileName)
|
| 102 |
|
| 103 |
-
|
| 104 |
-
console.log('upscaling video..')
|
| 105 |
-
await upscaleVideo(videoFileName, shotPrompt)
|
| 106 |
-
}
|
| 107 |
-
|
| 108 |
-
// TODO call AudioLDM
|
| 109 |
-
if (audioPrompt) {
|
| 110 |
-
// const audioFileName = await callAudioLDM(audioPrompt)
|
| 111 |
-
console.log('calling audio prompt')
|
| 112 |
-
|
| 113 |
-
// await addAudioToVideo(videoFileName, audioFileName)
|
| 114 |
-
}
|
| 115 |
-
|
| 116 |
-
console.log('returning result to user..')
|
| 117 |
-
|
| 118 |
-
const filePath = path.resolve(tmpDir, videoFileName)
|
| 119 |
|
| 120 |
const buffer = await fs.readFile(filePath)
|
| 121 |
-
|
| 122 |
-
res.setHeader(
|
|
|
|
| 123 |
res.end(buffer)
|
| 124 |
})
|
| 125 |
|
|
|
|
| 1 |
+
import { promises as fs } from "fs"
|
|
|
|
| 2 |
|
| 3 |
+
import express from "express"
|
|
|
|
| 4 |
|
| 5 |
+
import { generateSeed } from "./services/generateSeed.mts"
|
| 6 |
+
import { Job, ShotQuery } from "./types.mts"
|
| 7 |
+
import { generateShot } from "./services/generateShot.mts"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
const app = express()
|
| 10 |
const port = 7860
|
| 11 |
|
| 12 |
app.use(express.json())
|
| 13 |
|
| 14 |
+
const queue: Job[] = []
|
| 15 |
|
| 16 |
+
app.post("/shot", async (req, res) => {
|
| 17 |
+
const query = req.body as ShotQuery
|
| 18 |
|
| 19 |
+
const token = `${query.token || ""}`
|
|
|
|
| 20 |
if (token !== process.env.VS_SECRET_ACCESS_TOKEN) {
|
| 21 |
console.log("couldn't find access token in the query")
|
| 22 |
+
res.write(JSON.stringify({ error: true, message: "access denied" }))
|
| 23 |
res.end()
|
| 24 |
return
|
| 25 |
}
|
| 26 |
|
| 27 |
+
const shotPrompt = `${query.shotPrompt || ""}`
|
| 28 |
if (shotPrompt.length < 5) {
|
| 29 |
+
res.write(JSON.stringify({ error: true, message: "prompt too short (must be at least 5 in length)" }))
|
| 30 |
res.end()
|
| 31 |
return
|
| 32 |
}
|
| 33 |
|
| 34 |
// optional video URL
|
| 35 |
+
// const inputVideo = `${req.query.inputVideo || ""}`
|
| 36 |
+
|
| 37 |
+
// optional background audio prompt
|
| 38 |
+
const backgroundAudioPrompt = `${query.backgroundAudioPrompt || ""}`
|
| 39 |
|
| 40 |
+
// optional foreground audio prompt
|
| 41 |
+
const foregroundAudioPrompt = `${query.foregroundAudioPrompt || ""}`
|
| 42 |
|
| 43 |
// optional seed
|
| 44 |
const defaultSeed = generateSeed()
|
| 45 |
const seedStr = Number(`${query.seed || defaultSeed}`)
|
| 46 |
const maybeSeed = Number(seedStr)
|
| 47 |
const seed = isNaN(maybeSeed) || ! isFinite(maybeSeed) ? defaultSeed : maybeSeed
|
| 48 |
+
|
| 49 |
+
// in production we want those ON by default
|
| 50 |
+
const upscale = `${query.upscale || "true"}` === "true"
|
| 51 |
+
const interpolate = `${query.upscale || "true"}` === "true"
|
| 52 |
+
const noise = `${query.noise || "true"}` === "true"
|
| 53 |
|
|
|
|
|
|
|
| 54 |
|
|
|
|
| 55 |
const defaultDuration = 3
|
| 56 |
+
const maxDuration = 5
|
| 57 |
const durationStr = Number(`${query.duration || defaultDuration}`)
|
| 58 |
const maybeDuration = Number(durationStr)
|
| 59 |
+
const duration = Math.min(maxDuration, Math.max(1, isNaN(maybeDuration) || !isFinite(maybeDuration) ? defaultDuration : maybeDuration))
|
| 60 |
|
| 61 |
const defaultSteps = 35
|
| 62 |
const stepsStr = Number(`${query.steps || defaultSteps}`)
|
|
|
|
| 67 |
const defaultFps = 24
|
| 68 |
const fpsStr = Number(`${query.fps || defaultFps}`)
|
| 69 |
const maybeFps = Number(fpsStr)
|
| 70 |
+
const nbFrames = Math.min(60, Math.max(8, isNaN(maybeFps) || !isFinite(maybeFps) ? defaultFps : maybeFps))
|
| 71 |
|
| 72 |
const defaultResolution = 576
|
| 73 |
const resolutionStr = Number(`${query.resolution || defaultResolution}`)
|
| 74 |
const maybeResolution = Number(resolutionStr)
|
| 75 |
const resolution = Math.min(1080, Math.max(256, isNaN(maybeResolution) || !isFinite(maybeResolution) ? defaultResolution : maybeResolution))
|
| 76 |
|
| 77 |
+
const actorPrompt = `${query.actorPrompt || ""}`
|
| 78 |
+
|
| 79 |
+
const actorVoicePrompt = `${query.actorVoicePrompt || ""}`
|
| 80 |
+
|
| 81 |
+
const actorDialoguePrompt = `${query.actorDialoguePrompt || ""}`
|
| 82 |
|
|
|
|
| 83 |
|
| 84 |
+
const { filePath } = await generateShot({
|
| 85 |
+
seed,
|
| 86 |
+
actorPrompt,
|
| 87 |
shotPrompt,
|
| 88 |
+
backgroundAudioPrompt,
|
| 89 |
+
foregroundAudioPrompt,
|
| 90 |
+
actorDialoguePrompt,
|
| 91 |
+
actorVoicePrompt,
|
| 92 |
duration,
|
| 93 |
+
nbFrames,
|
| 94 |
+
resolution,
|
| 95 |
nbSteps,
|
|
|
|
|
|
|
| 96 |
upscale,
|
| 97 |
+
interpolate,
|
| 98 |
+
noise,
|
| 99 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
console.log(`generated video in ${filePath}`)
|
|
|
|
| 102 |
|
| 103 |
+
console.log("returning result to user..")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
const buffer = await fs.readFile(filePath)
|
| 106 |
+
|
| 107 |
+
res.setHeader("Content-Type", "media/mp4")
|
| 108 |
+
res.setHeader("Content-Length", buffer.length)
|
| 109 |
res.end(buffer)
|
| 110 |
})
|
| 111 |
|
src/services/addAudioToVideo.mts
CHANGED
|
@@ -1,29 +1,45 @@
|
|
| 1 |
-
import
|
| 2 |
-
import
|
| 3 |
|
| 4 |
-
import tmpDir from
|
| 5 |
-
import
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
await new Promise((resolve, reject) => {
|
| 12 |
ffmpeg(videoFilePath)
|
| 13 |
.input(audioFilePath)
|
| 14 |
-
.
|
| 15 |
-
.outputOptions(
|
| 16 |
-
.outputOptions(
|
| 17 |
-
.outputOptions(
|
| 18 |
-
.outputOptions(
|
|
|
|
| 19 |
.output(tempOutputFilePath)
|
| 20 |
-
.on(
|
| 21 |
-
.on(
|
| 22 |
.run()
|
| 23 |
})
|
| 24 |
|
| 25 |
// Now we want to replace the original video file with the new file that has been created
|
| 26 |
await fs.rename(tempOutputFilePath, videoFilePath)
|
| 27 |
|
| 28 |
-
return
|
| 29 |
};
|
|
|
|
| 1 |
+
import { promises as fs } from "node:fs"
|
| 2 |
+
import path from "node:path"
|
| 3 |
|
| 4 |
+
import tmpDir from "temp-dir"
|
| 5 |
+
import { v4 as uuidv4 } from "uuid"
|
| 6 |
|
| 7 |
+
import ffmpeg from "fluent-ffmpeg"
|
| 8 |
+
|
| 9 |
+
export const addAudioToVideo = async (
|
| 10 |
+
videoFileName: string,
|
| 11 |
+
audioFileName: string,
|
| 12 |
+
|
| 13 |
+
/*
|
| 14 |
+
* 0.0: mute the audio completely
|
| 15 |
+
* 0.5: set the audio to 50% of original volume (half volume)
|
| 16 |
+
* 1.0: maintain the audio at original volume (100% of original volume)
|
| 17 |
+
* 2.0: amplify the audio to 200% of original volume (double volume - might cause clipping)
|
| 18 |
+
*/
|
| 19 |
+
volume: number = 1.0
|
| 20 |
+
): Promise<string> => {
|
| 21 |
|
| 22 |
+
const tempOutputFilePath = `${uuidv4()}.mp4`
|
| 23 |
+
const videoFilePath = path.resolve(tmpDir, videoFileName)
|
| 24 |
+
const audioFilePath = path.resolve(tmpDir, audioFileName)
|
| 25 |
+
|
| 26 |
await new Promise((resolve, reject) => {
|
| 27 |
ffmpeg(videoFilePath)
|
| 28 |
.input(audioFilePath)
|
| 29 |
+
.audioFilters({ filter: 'volume', options: volume }) // add audio filter for volume
|
| 30 |
+
.outputOptions("-c:v copy") // use video copy codec
|
| 31 |
+
.outputOptions("-c:a aac") // use audio codec
|
| 32 |
+
.outputOptions("-map 0:v:0") // map video from 0th to 0th
|
| 33 |
+
.outputOptions("-map 1:a:0") // map audio from 1st to 0th
|
| 34 |
+
.outputOptions("-shortest") // finish encoding when shortest input stream ends
|
| 35 |
.output(tempOutputFilePath)
|
| 36 |
+
.on("end", resolve)
|
| 37 |
+
.on("error", reject)
|
| 38 |
.run()
|
| 39 |
})
|
| 40 |
|
| 41 |
// Now we want to replace the original video file with the new file that has been created
|
| 42 |
await fs.rename(tempOutputFilePath, videoFilePath)
|
| 43 |
|
| 44 |
+
return videoFileName
|
| 45 |
};
|
src/services/generateActor.mts
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { promises as fs } from "node:fs"
|
| 2 |
+
import path from "node:path"
|
| 3 |
+
import tmpDir from "temp-dir"
|
| 4 |
+
|
| 5 |
+
import { HfInference } from "@huggingface/inference"
|
| 6 |
+
|
| 7 |
+
const hf = new HfInference(process.env.VS_HF_API_TOKEN)
|
| 8 |
+
|
| 9 |
+
export const generateActor = async (prompt: string, fileName: string, seed: number) => {
|
| 10 |
+
const positivePrompt = [
|
| 11 |
+
`profile photo of ${prompt || ""}`,
|
| 12 |
+
"id picture",
|
| 13 |
+
"photoshoot",
|
| 14 |
+
"portrait photography",
|
| 15 |
+
"neutral expression",
|
| 16 |
+
"neutral background",
|
| 17 |
+
"studio photo",
|
| 18 |
+
"award winning",
|
| 19 |
+
"high resolution",
|
| 20 |
+
"photo realistic",
|
| 21 |
+
"intricate details",
|
| 22 |
+
"beautiful",
|
| 23 |
+
]
|
| 24 |
+
const negativePrompt = [
|
| 25 |
+
"anime",
|
| 26 |
+
"drawing",
|
| 27 |
+
"painting",
|
| 28 |
+
"lowres",
|
| 29 |
+
"blurry",
|
| 30 |
+
"artificial"
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
console.log(`generating actor: ${positivePrompt.join(", ")}`)
|
| 34 |
+
|
| 35 |
+
const blob = await hf.textToImage({
|
| 36 |
+
inputs: positivePrompt.join(", "),
|
| 37 |
+
model: "stabilityai/stable-diffusion-2-1",
|
| 38 |
+
parameters: {
|
| 39 |
+
negative_prompt: negativePrompt.join(", "),
|
| 40 |
+
// seed, no seed?
|
| 41 |
+
}
|
| 42 |
+
})
|
| 43 |
+
|
| 44 |
+
const filePath = path.resolve(tmpDir, fileName)
|
| 45 |
+
|
| 46 |
+
const buffer = Buffer.from(await blob.arrayBuffer())
|
| 47 |
+
await fs.writeFile(filePath, buffer, "utf8")
|
| 48 |
+
|
| 49 |
+
return filePath
|
| 50 |
+
}
|
src/services/generateAudio.mts
CHANGED
|
@@ -1,33 +1,56 @@
|
|
| 1 |
-
import
|
| 2 |
-
|
| 3 |
-
import { generateSeed } from "./generateSeed.mts"
|
| 4 |
|
| 5 |
const instances: string[] = [
|
| 6 |
process.env.VS_AUDIO_GENERATION_SPACE_API_URL
|
| 7 |
]
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
nbFrames: number;
|
| 12 |
-
nbSteps: number;
|
| 13 |
-
}) => {
|
| 14 |
-
const seed = options?.seed || generateSeed()
|
| 15 |
-
const nbFrames = options?.nbFrames || 24 // we can go up to 48 frames, but then upscaling quill require too much memory!
|
| 16 |
-
const nbSteps = options?.nbSteps || 35
|
| 17 |
-
|
| 18 |
const instance = instances.shift()
|
| 19 |
instances.push(instance)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
const rawResponse = await api.predict('/run', [
|
| 24 |
-
prompt, // string in 'Prompt' Textbox component
|
| 25 |
-
seed, // number (numeric value between 0 and 2147483647) in 'Seed' Slider component
|
| 26 |
-
nbFrames, // 24 // it is the nb of frames per seconds I think?
|
| 27 |
-
nbSteps, // 10, (numeric value between 10 and 50) in 'Number of inference steps' Slider component
|
| 28 |
-
]) as any
|
| 29 |
|
| 30 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
return
|
| 33 |
}
|
|
|
|
| 1 |
+
import puppeteer from "puppeteer"
|
| 2 |
+
import { downloadVideo } from "./downloadVideo.mts"
|
|
|
|
| 3 |
|
| 4 |
const instances: string[] = [
|
| 5 |
process.env.VS_AUDIO_GENERATION_SPACE_API_URL
|
| 6 |
]
|
| 7 |
|
| 8 |
+
// TODO we should use an inference endpoint instead
|
| 9 |
+
export async function generateAudio(prompt: string, audioFileName: string) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
const instance = instances.shift()
|
| 11 |
instances.push(instance)
|
| 12 |
|
| 13 |
+
console.log("instance:", instance)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
const browser = await puppeteer.launch({
|
| 16 |
+
headless: false,
|
| 17 |
+
protocolTimeout: 800000,
|
| 18 |
+
})
|
| 19 |
+
|
| 20 |
+
const page = await browser.newPage()
|
| 21 |
+
|
| 22 |
+
await page.goto(instance, {
|
| 23 |
+
waitUntil: "networkidle2",
|
| 24 |
+
})
|
| 25 |
+
|
| 26 |
+
await new Promise(r => setTimeout(r, 3000))
|
| 27 |
+
|
| 28 |
+
const firstTextboxInput = await page.$('input[data-testid="textbox"]')
|
| 29 |
+
|
| 30 |
+
await firstTextboxInput.type(prompt)
|
| 31 |
+
|
| 32 |
+
// console.log("looking for the button to submit")
|
| 33 |
+
const submitButton = await page.$("button.lg")
|
| 34 |
+
|
| 35 |
+
// console.log("clicking on the button")
|
| 36 |
+
await submitButton.click()
|
| 37 |
+
|
| 38 |
+
await page.waitForSelector("a[download]", {
|
| 39 |
+
timeout: 800000, // need to be large enough in case someone else attemps to use our space
|
| 40 |
+
})
|
| 41 |
+
|
| 42 |
+
const audioRemoteUrl = await page.$$eval("a[download]", el => el.map(x => x.getAttribute("href"))[0])
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
console.log({
|
| 46 |
+
audioRemoteUrl,
|
| 47 |
+
})
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
// console.log("downloading file from space..")
|
| 51 |
+
console.log(`- downloading ${audioFileName} from ${audioRemoteUrl}`)
|
| 52 |
+
|
| 53 |
+
await downloadVideo(audioRemoteUrl, audioFileName)
|
| 54 |
|
| 55 |
+
return audioFileName
|
| 56 |
}
|
src/services/generateAudioLegacy.mts
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { client } from '@gradio/client'
|
| 2 |
+
|
| 3 |
+
import { generateSeed } from "./generateSeed.mts"
|
| 4 |
+
|
| 5 |
+
const instances: string[] = [
|
| 6 |
+
process.env.VS_AUDIO_GENERATION_SPACE_API_URL
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
export const generateAudio = async (prompt: string, options?: {
|
| 10 |
+
seed: number;
|
| 11 |
+
nbFrames: number;
|
| 12 |
+
nbSteps: number;
|
| 13 |
+
}) => {
|
| 14 |
+
const seed = options?.seed || generateSeed()
|
| 15 |
+
const nbFrames = options?.nbFrames || 24 // we can go up to 48 frames, but then upscaling quill require too much memory!
|
| 16 |
+
const nbSteps = options?.nbSteps || 35
|
| 17 |
+
|
| 18 |
+
const instance = instances.shift()
|
| 19 |
+
instances.push(instance)
|
| 20 |
+
|
| 21 |
+
const api = await client(instance)
|
| 22 |
+
|
| 23 |
+
const rawResponse = await api.predict('/run', [
|
| 24 |
+
prompt, // string in 'Prompt' Textbox component
|
| 25 |
+
seed, // number (numeric value between 0 and 2147483647) in 'Seed' Slider component
|
| 26 |
+
nbFrames, // 24 // it is the nb of frames per seconds I think?
|
| 27 |
+
nbSteps, // 10, (numeric value between 10 and 50) in 'Number of inference steps' Slider component
|
| 28 |
+
]) as any
|
| 29 |
+
|
| 30 |
+
const { name } = rawResponse?.data?.[0]?.[0] as { name: string, orig_name: string }
|
| 31 |
+
|
| 32 |
+
return `${instance}/file=${name}`
|
| 33 |
+
}
|
src/services/generateShot.mts
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import path from "node:path"
|
| 2 |
+
|
| 3 |
+
import { v4 as uuidv4 } from "uuid"
|
| 4 |
+
import tmpDir from "temp-dir"
|
| 5 |
+
|
| 6 |
+
import { downloadVideo } from "./downloadVideo.mts"
|
| 7 |
+
import { generateAudio } from "./generateAudio.mts"
|
| 8 |
+
import { generateVideo } from "./generateVideo.mts"
|
| 9 |
+
import { upscaleVideo } from "./upscaleVideo.mts"
|
| 10 |
+
import { generateVoice } from "./generateVoice.mts"
|
| 11 |
+
import { generateSeed } from "./generateSeed.mts"
|
| 12 |
+
import { mergeAudio } from "./mergeAudio.mts"
|
| 13 |
+
import { addAudioToVideo } from "./addAudioToVideo.mts"
|
| 14 |
+
import { interpolateVideo } from "./interpolateVideo.mts"
|
| 15 |
+
import { postInterpolation } from "./postInterpolation.mts"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
export const generateShot = async ({
|
| 19 |
+
seed = 0,
|
| 20 |
+
shotId = "",
|
| 21 |
+
actorPrompt = "",
|
| 22 |
+
shotPrompt = "",
|
| 23 |
+
backgroundAudioPrompt = "",
|
| 24 |
+
foregroundAudioPrompt = "",
|
| 25 |
+
actorDialoguePrompt = "",
|
| 26 |
+
actorVoicePrompt = "",
|
| 27 |
+
duration = 2,
|
| 28 |
+
nbFrames = 24,
|
| 29 |
+
resolution = 576,
|
| 30 |
+
nbSteps = 35,
|
| 31 |
+
upscale = true,
|
| 32 |
+
interpolate = true,
|
| 33 |
+
noise = true,
|
| 34 |
+
}: {
|
| 35 |
+
seed?: number;
|
| 36 |
+
shotId?: string;
|
| 37 |
+
actorPrompt?: string;
|
| 38 |
+
shotPrompt?: string;
|
| 39 |
+
backgroundAudioPrompt?: string;
|
| 40 |
+
foregroundAudioPrompt?: string;
|
| 41 |
+
actorDialoguePrompt?: string;
|
| 42 |
+
actorVoicePrompt?: string;
|
| 43 |
+
duration?: number; // 2 seconds
|
| 44 |
+
nbFrames?: number; // 24 FPS
|
| 45 |
+
resolution?: number; // 256, 320, 512, 576, 720, 1080..
|
| 46 |
+
nbSteps?: number;
|
| 47 |
+
upscale?: boolean;
|
| 48 |
+
interpolate?: boolean;
|
| 49 |
+
noise?: boolean;
|
| 50 |
+
}) => {
|
| 51 |
+
seed = seed || generateSeed()
|
| 52 |
+
shotId = shotId || uuidv4()
|
| 53 |
+
|
| 54 |
+
const shotFileName = `${shotId}.mp4`
|
| 55 |
+
|
| 56 |
+
console.log("generating video shot:", {
|
| 57 |
+
seed,
|
| 58 |
+
shotId,
|
| 59 |
+
actorPrompt,
|
| 60 |
+
shotPrompt,
|
| 61 |
+
backgroundAudioPrompt,
|
| 62 |
+
foregroundAudioPrompt,
|
| 63 |
+
actorDialoguePrompt,
|
| 64 |
+
actorVoicePrompt,
|
| 65 |
+
duration,
|
| 66 |
+
nbFrames,
|
| 67 |
+
resolution,
|
| 68 |
+
nbSteps,
|
| 69 |
+
upscale,
|
| 70 |
+
interpolate,
|
| 71 |
+
noise,
|
| 72 |
+
})
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if (actorPrompt) {
|
| 76 |
+
console.log("generating actor..")
|
| 77 |
+
const actorIdentityFileName = `actor_${Date.now()}.png`
|
| 78 |
+
// await generateActor(actorPrompt, actorIdentityFileName, seed)
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
console.log("generating base video ..")
|
| 82 |
+
let generatedVideoUrl = ""
|
| 83 |
+
|
| 84 |
+
// currenty the base model is incapable of generating more than 24 FPS,
|
| 85 |
+
// because otherwise the upscaler will have trouble
|
| 86 |
+
|
| 87 |
+
// so for now, we fix it to 24 frames
|
| 88 |
+
// const nbFramesForBaseModel = Math.min(3, Math.max(1, Math.round(duration))) * 8
|
| 89 |
+
const nbFramesForBaseModel = 24
|
| 90 |
+
|
| 91 |
+
try {
|
| 92 |
+
generatedVideoUrl = await generateVideo(shotPrompt, {
|
| 93 |
+
seed,
|
| 94 |
+
nbFrames: nbFramesForBaseModel,
|
| 95 |
+
nbSteps
|
| 96 |
+
})
|
| 97 |
+
} catch (err) {
|
| 98 |
+
// upscaling can be finicky, if it fails we try again
|
| 99 |
+
console.log('- trying again to generate base shot..')
|
| 100 |
+
generatedVideoUrl = await generateVideo(shotPrompt, {
|
| 101 |
+
seed,
|
| 102 |
+
nbFrames: nbFramesForBaseModel,
|
| 103 |
+
nbSteps
|
| 104 |
+
})
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
console.log("downloading video..")
|
| 108 |
+
|
| 109 |
+
const videoFileName = await downloadVideo(generatedVideoUrl, shotFileName)
|
| 110 |
+
|
| 111 |
+
if (upscale) {
|
| 112 |
+
console.log("upscaling video..")
|
| 113 |
+
try {
|
| 114 |
+
await upscaleVideo(videoFileName, shotPrompt)
|
| 115 |
+
} catch (err) {
|
| 116 |
+
// upscaling can be finicky, if it fails we try again
|
| 117 |
+
console.log('- trying again to upscale shot..')
|
| 118 |
+
await upscaleVideo(videoFileName, shotPrompt)
|
| 119 |
+
}
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
if (interpolate) {
|
| 123 |
+
console.log("upscaling video..")
|
| 124 |
+
// ATTENTION 1:
|
| 125 |
+
// the interpolation step always create a SLOW MOTION video
|
| 126 |
+
// it means it can last a lot longer (eg. 2x, 3x, 4x.. longer)
|
| 127 |
+
// than the duration generated by the original video model
|
| 128 |
+
|
| 129 |
+
// ATTENTION 2:
|
| 130 |
+
// the interpolation step generates videos in 910x512!
|
| 131 |
+
|
| 132 |
+
// ATTENTION 3:
|
| 133 |
+
// the interpolation step parameters are currently not passed to the space,
|
| 134 |
+
// so changing those two variables below will have no effect!
|
| 135 |
+
const interpolationSteps = 3
|
| 136 |
+
const interpolatedFramesPerSecond = 24
|
| 137 |
+
await interpolateVideo(
|
| 138 |
+
videoFileName,
|
| 139 |
+
interpolationSteps,
|
| 140 |
+
interpolatedFramesPerSecond
|
| 141 |
+
)
|
| 142 |
+
console.log('creating slow-mo video (910x512 @ 24 FPS)')
|
| 143 |
+
|
| 144 |
+
// with our current interpolation settings, the 3 seconds video generated by the model
|
| 145 |
+
// become a 7 seconds video, at 24 FPS
|
| 146 |
+
|
| 147 |
+
// so we want to scale it back to the desired duration length
|
| 148 |
+
// also, as a last trick we want to upscale it (without AI) and add some FXs
|
| 149 |
+
console.log('performing final scaling (1280x720 @ 24 FPS)')
|
| 150 |
+
await postInterpolation(videoFileName, duration, nbFrames)
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
let backgroundAudioFileName = ''
|
| 154 |
+
if (backgroundAudioPrompt) {
|
| 155 |
+
console.log("generating background audio..")
|
| 156 |
+
backgroundAudioFileName = await generateAudio(backgroundAudioPrompt, `shot_${shotId}_audio_${uuidv4}.m4a`)
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
let foregroundAudioFileName = ''
|
| 160 |
+
if (foregroundAudioPrompt) {
|
| 161 |
+
console.log("generating foreground audio..")
|
| 162 |
+
foregroundAudioFileName = await generateAudio(foregroundAudioPrompt, `shot_${shotId}_audio_${uuidv4()}.m4a`)
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
let voiceAudioFileName = ''
|
| 167 |
+
if (actorDialoguePrompt) {
|
| 168 |
+
console.log("configuring dialogue..")
|
| 169 |
+
if (actorVoicePrompt) {
|
| 170 |
+
console.log("configuring voice..")
|
| 171 |
+
// well.. that's a TODO!
|
| 172 |
+
// for now let's always use the same voice model
|
| 173 |
+
|
| 174 |
+
console.log('TODO this should be done in the sequence, not the prompt!')
|
| 175 |
+
voiceAudioFileName = await generateVoice(actorDialoguePrompt, `shot_${shotId}_voice_${uuidv4()}.m4a`)
|
| 176 |
+
}
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
console.log('merging audio with video..')
|
| 180 |
+
if (backgroundAudioFileName || foregroundAudioFileName) {
|
| 181 |
+
let audioFileName = ''
|
| 182 |
+
|
| 183 |
+
// we have both background and foreground
|
| 184 |
+
if (backgroundAudioFileName && foregroundAudioFileName) {
|
| 185 |
+
audioFileName = await mergeAudio({
|
| 186 |
+
input1FileName: backgroundAudioFileName,
|
| 187 |
+
input1Volume: 0.2,// 20% volume
|
| 188 |
+
input2FileName: foregroundAudioFileName,
|
| 189 |
+
input2Volume: 0.7, // 70% volume
|
| 190 |
+
})
|
| 191 |
+
} else if (backgroundAudioFileName) {
|
| 192 |
+
audioFileName = backgroundAudioFileName
|
| 193 |
+
} else if (foregroundAudioFileName) {
|
| 194 |
+
audioFileName = foregroundAudioFileName
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
await addAudioToVideo(videoFileName, audioFileName)
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
console.log("returning result to user..")
|
| 201 |
+
|
| 202 |
+
const filePath = path.resolve(tmpDir, videoFileName)
|
| 203 |
+
|
| 204 |
+
return {
|
| 205 |
+
shotId,
|
| 206 |
+
filePath,
|
| 207 |
+
videoFileName
|
| 208 |
+
}
|
| 209 |
+
}
|
src/services/generateVideo.mts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
import { client } from
|
|
|
|
| 2 |
|
| 3 |
import { generateSeed } from "./generateSeed.mts"
|
| 4 |
|
|
|
|
| 1 |
+
import { client } from "@gradio/client"
|
| 2 |
+
|
| 3 |
|
| 4 |
import { generateSeed } from "./generateSeed.mts"
|
| 5 |
|
src/services/generateVoice.mts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import puppeteer from "puppeteer"
|
| 2 |
+
|
| 3 |
+
import { downloadVideo } from "./downloadVideo.mts"
|
| 4 |
+
|
| 5 |
+
const instances: string[] = [
|
| 6 |
+
process.env.VS_VOICE_GENERATION_SPACE_API_URL
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
// TODO we should use an inference endpoint instead
|
| 10 |
+
export async function generateVoice(prompt: string, voiceFileName: string) {
|
| 11 |
+
const instance = instances.shift()
|
| 12 |
+
instances.push(instance)
|
| 13 |
+
|
| 14 |
+
console.log("instance:", instance)
|
| 15 |
+
|
| 16 |
+
const browser = await puppeteer.launch({
|
| 17 |
+
headless: false,
|
| 18 |
+
protocolTimeout: 800000,
|
| 19 |
+
})
|
| 20 |
+
|
| 21 |
+
const page = await browser.newPage()
|
| 22 |
+
|
| 23 |
+
await page.goto(instance, {
|
| 24 |
+
waitUntil: "networkidle2",
|
| 25 |
+
})
|
| 26 |
+
|
| 27 |
+
await new Promise(r => setTimeout(r, 3000))
|
| 28 |
+
|
| 29 |
+
const firstTextarea = await page.$('textarea[data-testid="textbox"]')
|
| 30 |
+
|
| 31 |
+
await firstTextarea.type(prompt)
|
| 32 |
+
|
| 33 |
+
// console.log("looking for the button to submit")
|
| 34 |
+
const submitButton = await page.$("button.lg")
|
| 35 |
+
|
| 36 |
+
// console.log("clicking on the button")
|
| 37 |
+
await submitButton.click()
|
| 38 |
+
|
| 39 |
+
await page.waitForSelector("audio", {
|
| 40 |
+
timeout: 800000, // need to be large enough in case someone else attemps to use our space
|
| 41 |
+
})
|
| 42 |
+
|
| 43 |
+
const voiceRemoteUrl = await page.$$eval("audio", el => el.map(x => x.getAttribute("src"))[0])
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
console.log({
|
| 47 |
+
voiceRemoteUrl,
|
| 48 |
+
})
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
console.log(`- downloading ${voiceFileName} from ${voiceRemoteUrl}`)
|
| 52 |
+
|
| 53 |
+
await downloadVideo(voiceRemoteUrl, voiceFileName)
|
| 54 |
+
|
| 55 |
+
return voiceFileName
|
| 56 |
+
}
|
src/services/interpolateVideo.mts
CHANGED
|
@@ -1,40 +1,53 @@
|
|
| 1 |
-
import { promises as fs } from "node:fs"
|
| 2 |
import path from "node:path"
|
| 3 |
-
import { Blob } from "buffer"
|
| 4 |
-
// import { blobFrom } from "fetch-blob"
|
| 5 |
|
| 6 |
-
import
|
| 7 |
import tmpDir from "temp-dir"
|
| 8 |
-
|
| 9 |
-
import { downloadVideo } from './downloadVideo.mts'
|
| 10 |
|
| 11 |
const instances: string[] = [
|
| 12 |
process.env.VS_VIDEO_INTERPOLATION_SPACE_API_URL
|
| 13 |
]
|
| 14 |
|
| 15 |
-
export const interpolateVideo = async (fileName: string) => {
|
| 16 |
|
|
|
|
|
|
|
| 17 |
const inputFilePath = path.join(tmpDir, fileName)
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
const instance = instances.shift()
|
| 20 |
instances.push(instance)
|
| 21 |
|
| 22 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
const
|
| 25 |
|
| 26 |
-
|
| 27 |
-
// const blob = blobFrom(filePath)
|
| 28 |
-
const result = await api.predict(1, [
|
| 29 |
-
blob, // blob in 'parameter_5' Video component
|
| 30 |
-
1, // number (numeric value between 1 and 4) in 'Interpolation Steps' Slider component
|
| 31 |
-
24, // string in 'FPS output' Radio component
|
| 32 |
-
])
|
| 33 |
|
| 34 |
-
|
| 35 |
-
console.log('raw data:', data)
|
| 36 |
-
const { orig_name, data: remoteFilePath } = data
|
| 37 |
-
const remoteUrl = `${instance}/file=${remoteFilePath}`
|
| 38 |
-
console.log("remoteUrl:", remoteUrl)
|
| 39 |
-
await downloadVideo(remoteUrl, fileName)
|
| 40 |
}
|
|
|
|
|
|
|
| 1 |
import path from "node:path"
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
import puppeteer from "puppeteer"
|
| 4 |
import tmpDir from "temp-dir"
|
| 5 |
+
import { downloadVideo } from "./downloadVideo.mts"
|
|
|
|
| 6 |
|
| 7 |
const instances: string[] = [
|
| 8 |
process.env.VS_VIDEO_INTERPOLATION_SPACE_API_URL
|
| 9 |
]
|
| 10 |
|
|
|
|
| 11 |
|
| 12 |
+
// TODO we should use an inference endpoint instead
|
| 13 |
+
export async function interpolateVideo(fileName: string, steps: number, fps: number) {
|
| 14 |
const inputFilePath = path.join(tmpDir, fileName)
|
| 15 |
|
| 16 |
+
console.log(`interpolating ${fileName}`)
|
| 17 |
+
console.log(`warning: interpolateVideo parameter "${steps}" is ignored!`)
|
| 18 |
+
console.log(`warning: interpolateVideo parameter "${fps}" is ignored!`)
|
| 19 |
+
|
| 20 |
const instance = instances.shift()
|
| 21 |
instances.push(instance)
|
| 22 |
|
| 23 |
+
const browser = await puppeteer.launch({
|
| 24 |
+
headless: true,
|
| 25 |
+
protocolTimeout: 400000,
|
| 26 |
+
})
|
| 27 |
+
|
| 28 |
+
const page = await browser.newPage()
|
| 29 |
+
await page.goto(instance, { waitUntil: 'networkidle2' })
|
| 30 |
+
|
| 31 |
+
await new Promise(r => setTimeout(r, 3000))
|
| 32 |
+
|
| 33 |
+
const fileField = await page.$('input[type=file]')
|
| 34 |
+
|
| 35 |
+
// console.log(`uploading file..`)
|
| 36 |
+
await fileField.uploadFile(inputFilePath)
|
| 37 |
+
|
| 38 |
+
// console.log('looking for the button to submit')
|
| 39 |
+
const submitButton = await page.$('button.lg')
|
| 40 |
+
|
| 41 |
+
// console.log('clicking on the button')
|
| 42 |
+
await submitButton.click()
|
| 43 |
+
|
| 44 |
+
await page.waitForSelector('a[download="interpolated_result.mp4"]', {
|
| 45 |
+
timeout: 400000, // need to be large enough in case someone else attemps to use our space
|
| 46 |
+
})
|
| 47 |
|
| 48 |
+
const interpolatedFileUrl = await page.$$eval('a[download="interpolated_result.mp4"]', el => el.map(x => x.getAttribute("href"))[0])
|
| 49 |
|
| 50 |
+
await downloadVideo(interpolatedFileUrl, fileName)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
return fileName
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
}
|
import { promises as fs } from "node:fs"
import path from "node:path"
import { Blob } from "buffer"

import { client } from "@gradio/client"
import tmpDir from "temp-dir"

import { downloadVideo } from './downloadVideo.mts'

// round-robin pool of Gradio space URLs used for frame interpolation
const instances: string[] = [
  process.env.VS_VIDEO_INTERPOLATION_SPACE_API_URL
]

// Legacy interpolation path: calls the Gradio space through @gradio/client
// instead of driving the UI with a browser. Overwrites `fileName` in tmpDir
// with the interpolated result.
//
// @param fileName - name of the video file inside tmpDir (overwritten in place)
// @param steps    - interpolation steps, numeric value between 1 and 4
// @param fps      - target FPS for the output
export const interpolateVideo = async (fileName: string, steps: number, fps: number) => {

  const inputFilePath = path.join(tmpDir, fileName)

  // rotate through the instance pool (poor man's load balancing)
  const instance = instances.shift()
  instances.push(instance)

  const api = await client(instance)

  // read the whole video into memory so it can be sent as a Blob
  const video = await fs.readFile(inputFilePath)

  const blob = new Blob([video], { type: 'video/mp4' })
  // const blob = blobFrom(filePath)
  // NOTE(review): fn_index 1 and the parameter order are tied to the space's
  // current Gradio layout — verify against the space if it stops working
  const result = await api.predict(1, [
    blob, // blob in 'parameter_5' Video component
    steps, // number (numeric value between 1 and 4) in 'Interpolation Steps' Slider component
    fps, // string (FALSE! it's a number) in 'FPS output' Radio component
  ])

  // the space returns a file descriptor; shape assumed from observed responses
  const data = (result as any).data[0]
  console.log('raw data:', data)
  const { orig_name, data: remoteFilePath } = data
  // Gradio serves generated files under the /file= route of the space
  const remoteUrl = `${instance}/file=${remoteFilePath}`
  console.log("remoteUrl:", remoteUrl)
  await downloadVideo(remoteUrl, fileName)
}
src/services/mergeAudio.mts
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import path from "node:path"
|
| 2 |
+
|
| 3 |
+
import tmpDir from "temp-dir"
|
| 4 |
+
import { v4 as uuidv4 } from "uuid"
|
| 5 |
+
import ffmpeg from "fluent-ffmpeg"
|
| 6 |
+
|
| 7 |
+
export const mergeAudio = async ({
|
| 8 |
+
input1FileName,
|
| 9 |
+
input1Volume,
|
| 10 |
+
input2FileName,
|
| 11 |
+
input2Volume,
|
| 12 |
+
outputFileName = ''
|
| 13 |
+
}: {
|
| 14 |
+
input1FileName: string,
|
| 15 |
+
input1Volume: number,
|
| 16 |
+
input2FileName: string,
|
| 17 |
+
input2Volume: number,
|
| 18 |
+
outputFileName?: string
|
| 19 |
+
}): Promise<string> => {
|
| 20 |
+
outputFileName = `${uuidv4()}.m4a`
|
| 21 |
+
|
| 22 |
+
const input1FilePath = path.resolve(tmpDir, input1FileName)
|
| 23 |
+
const input2FilePath = path.resolve(tmpDir, input2FileName)
|
| 24 |
+
const outputFilePath = path.resolve(tmpDir, outputFileName)
|
| 25 |
+
|
| 26 |
+
const input1Ffmpeg = ffmpeg(input1FilePath)
|
| 27 |
+
.outputOptions("-map 0:a:0")
|
| 28 |
+
.audioFilters([{ filter: 'volume', options: input1Volume }]); // set volume for main audio
|
| 29 |
+
|
| 30 |
+
const input2Ffmpeg = ffmpeg(input2FilePath)
|
| 31 |
+
.outputOptions("-map 1:a:0")
|
| 32 |
+
.audioFilters([{ filter: 'volume', options: input2Volume }]); // set volume for additional audio
|
| 33 |
+
|
| 34 |
+
await new Promise((resolve, reject) => {
|
| 35 |
+
ffmpeg()
|
| 36 |
+
.input(input1Ffmpeg)
|
| 37 |
+
.input(input2Ffmpeg)
|
| 38 |
+
.outputOptions("-c:a aac") // use audio codec
|
| 39 |
+
.outputOptions("-shortest") // finish encoding when shortest input stream ends
|
| 40 |
+
.output(outputFilePath)
|
| 41 |
+
.on("end", resolve)
|
| 42 |
+
.on("error", reject)
|
| 43 |
+
.run()
|
| 44 |
+
})
|
| 45 |
+
|
| 46 |
+
console.log(`merged audio from ${input1FileName} and ${input2FileName} into ${outputFileName}`)
|
| 47 |
+
|
| 48 |
+
return outputFileName
|
| 49 |
+
}
|
import path from "node:path"
import fs from "node:fs"

import { v4 as uuidv4 } from "uuid"
import tmpDir from "temp-dir"
import ffmpeg from "fluent-ffmpeg"

// Post-processes an interpolated video in place: upscales to 720p,
// retimes it relative to the requested duration, adds film-grain noise,
// and forces the output frame rate. The result overwrites `fileName` in tmpDir.
//
// @param fileName - video file name inside tmpDir (overwritten in place)
// @param duration - target duration in seconds used to compute the retiming ratio
// @param nbFrames - value passed to -r (used here as the output frame rate)
// @returns the same `fileName` once re-encoding completes
export const postInterpolation = async (fileName: string, duration: number, nbFrames: number): Promise<string> => {
  return new Promise((resolve,reject) => {

    // re-encode into a temp file, then copy it back over the original
    const tmpFileName = `${uuidv4()}.mp4`

    const filePath = path.join(tmpDir, fileName)
    const tmpFilePath = path.join(tmpDir, tmpFileName)


    ffmpeg.ffprobe(filePath, function(err, metadata) {
      if (err) { reject(err); return; }


      // NOTE(review): may be undefined for some containers — verify ffprobe output
      const currentVideoDuration = metadata.format.duration

      // compute a ratio ex. 0.3 = 30% of the total length
      // NOTE(review): setpts=(current/target)*PTS shrinks the video when
      // current < target; confirm this ratio isn't inverted vs. the intent
      const durationRatio = currentVideoDuration / duration

      ffmpeg(filePath)

        // convert to HD
        .size("1280x720")

        .videoFilters([
          `setpts=${durationRatio}*PTS`, // we make the video faster
          //'scale=-1:576:lanczos',
          // 'unsharp=5:5:0.2:5:5:0.2', // not recommended, this make the video more "pixely"
          'noise=c0s=10:c0f=t+u' // add a movie grain noise
        ])
        .outputOptions([
          `-r ${nbFrames}`,
        ])

        .save(tmpFilePath)
        .on("end", async () => {
          // replace the original with the processed version
          await fs.promises.copyFile(tmpFilePath, filePath)
          try {
            await fs.promises.unlink(tmpFilePath)
          } catch (err) {
            console.log("failed to cleanup (no big deal..)")
          }

          resolve(fileName)
        })
        .on("error", (err) => {
          reject(err)
        })
    })
  })
}
src/test2.mts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { generateAudio } from "./services/generateAudio.mts"
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
console.log('generating background audio..')
|
| 5 |
+
const audioFileName = await generateAudio("sounds of a castle bell ringing alarm", "test_juju_audio.mp3")
|
| 6 |
+
|
| 7 |
+
console.log('result:', audioFileName)
|
// Parameters accepted by the shot-generation endpoint.
export interface ShotQuery {
  // access token used to authenticate the request
  token: string
  // description of the video shot to generate
  shotPrompt: string
  // inputVideo?: string

  // describe the background audio (crowd, birds, wind, sea etc..)
  backgroundAudioPrompt?: string

  // describe the foreground audio (cars revving, footsteps, objects breaking, explosion etc)
  foregroundAudioPrompt?: string

  // describe the main actor visible in the shot (optional)
  actorPrompt?: string

  // describe the main actor voice (man, woman, old, young, amused, annoyed.. etc)
  actorVoicePrompt?: string

  // describe the main actor dialogue line
  actorDialoguePrompt?: string

  // optional seed for reproducible generation
  seed?: number
  upscale?: boolean

  noise?: boolean // add movie noise

  // target duration in seconds
  duration?: number
  steps?: number

  fps?: number // 8, 12, 24, 30, 60

  resolution?: number // 256, 512, 576, 720, 1080
}

// A queued shot-generation job and its original request.
export interface Job {
  // ISO timestamp of when the job started
  startedAt: string
  query: ShotQuery
}
|