Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Michael Sapienza
commited on
Commit
·
ec17e66
0
Parent(s):
initial commit of sutra-avatar-v2
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +39 -0
- .gitignore +1 -0
- README.md +13 -0
- app.py +441 -0
- base_task_executor.py +179 -0
- cloud_task_executor.py +143 -0
- data/input_audio/gradio/female/en-BeesWingsBeat-Shelby.mp3 +3 -0
- data/input_audio/gradio/female/en-EnhanceEfficiency-Shelby.mp3 +3 -0
- data/input_audio/gradio/female/en-The2026WorldCup-Shelby.mp3 +3 -0
- data/input_audio/gradio/female/hi-BeesWingsBeat-Matilda.mp3 +3 -0
- data/input_audio/gradio/female/hi-EnhanceEfficiency-Matilda.mp3 +3 -0
- data/input_audio/gradio/female/hi-The2026WorldCup-Matilda.mp3 +3 -0
- data/input_audio/gradio/female/ko-BeesWingsBeat-Jinju.mp3 +3 -0
- data/input_audio/gradio/female/ko-EnhanceEfficiency-Jinju.mp3 +3 -0
- data/input_audio/gradio/female/ko-The2026WorldCup-Jinju.mp3 +3 -0
- data/input_audio/gradio/male/en-BeesWingsBeat-Marcus.mp3 +3 -0
- data/input_audio/gradio/male/en-EnhanceEfficiency-Marcus.mp3 +3 -0
- data/input_audio/gradio/male/en-The2026WorldCup-Marcus.mp3 +3 -0
- data/input_audio/gradio/male/hi-BeesWingsBeat-Liam.mp3 +3 -0
- data/input_audio/gradio/male/hi-EnhanceEfficiency-Liam.mp3 +3 -0
- data/input_audio/gradio/male/hi-The2026WorldCup-Liam.mp3 +3 -0
- data/input_audio/gradio/male/ko-BeesWingsBeat-Noah.mp3 +3 -0
- data/input_audio/gradio/male/ko-EnhanceEfficiency-Noah.mp3 +3 -0
- data/input_audio/gradio/male/ko-The2026WorldCup-Noah.mp3 +3 -0
- data/input_image_bases/female/01-Female-American_608.jpg +3 -0
- data/input_image_bases/female/02-Female-Indian01_608.jpg +3 -0
- data/input_image_bases/female/03-Female-Korean_608.jpg +3 -0
- data/input_image_bases/female/04-Female-Indian02_608.jpg +3 -0
- data/input_image_bases/female/05-Female-European_608.jpg +3 -0
- data/input_image_bases/male/01-Male-Indian_608.jpg +3 -0
- data/input_image_bases/male/02-Male-Korean_608.jpg +3 -0
- data/input_image_bases/male/03-Male-European_608.jpg +3 -0
- data/input_image_bases/male/04-Male-American_608.jpg +3 -0
- data/input_image_bases/male/05-Male-AfricanAmerican_608.jpg +3 -0
- data/input_video_bases/female/01-Female-Korean_608.mp4 +3 -0
- data/input_video_bases/female/02-Female-Latina_608.mp4 +3 -0
- data/input_video_bases/female/03-Female-European_608.mp4 +3 -0
- data/input_video_bases/female/04-Female-Indian_608.mp4 +3 -0
- data/input_video_bases/female/05-Female-American_608.mp4 +3 -0
- data/input_video_bases/male/01-Male-Japanese_608.mp4 +3 -0
- data/input_video_bases/male/02-Male-European_608.mp4 +3 -0
- data/input_video_bases/male/03-Male-American02_608.mp4 +3 -0
- data/input_video_bases/male/04-Male-Indian_608.mp4 +3 -0
- data/input_video_bases/male/05-Male-American_608.mp4 +3 -0
- data/showcase_examples/archive/01 Multilingual Female_720.mp4 +3 -0
- data/showcase_examples/archive/02 Multilingual Male_720.mp4 +3 -0
- data/showcase_examples/archive/02 Multilingual Male_720_IM.mp4 +3 -0
- data/showcase_examples/archive/03 Corporate Message_720.mp4 +3 -0
- data/showcase_examples/archive/04 Multi-Identities: Multilingual_720.mp4 +3 -0
- data/showcase_examples/archive/05 Multi-Identities: Rap_720.mp4 +3 -0
.gitattributes
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
38 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: sutra-avatar-v2
|
3 |
+
emoji: 🐨
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.3.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding: utf-8
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import glob
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
|
10 |
+
from cloud_task_executor import CloudTaskExecutor
|
11 |
+
from elevenlabs_helper import ElevenLabsHelper
|
12 |
+
|
13 |
+
# ---
|
14 |
+
talk_key = "talk"
|
15 |
+
valid_base_motion_expressions = [
|
16 |
+
f"{talk_key}-head",
|
17 |
+
f"{talk_key}-neutral",
|
18 |
+
"smile",
|
19 |
+
"approve",
|
20 |
+
"disapprove",
|
21 |
+
"confused",
|
22 |
+
"sad",
|
23 |
+
"surprised",
|
24 |
+
]
|
25 |
+
|
26 |
+
|
27 |
+
def get_default_base_motion_expression():
|
28 |
+
return valid_base_motion_expressions[0]
|
29 |
+
|
30 |
+
|
31 |
+
# ---
|
32 |
+
|
33 |
+
|
34 |
+
def get_sorted_filenames_in_dir(dir_path: str, ext: str = ".jpg", throw_if_empty: bool = True) -> list:
|
35 |
+
"""Return the sorted filenames in the spedified directory."""
|
36 |
+
p = Path(dir_path)
|
37 |
+
if not p.exists() and not p.is_dir():
|
38 |
+
raise RuntimeError(f"The path: {dir_path} does not exist")
|
39 |
+
|
40 |
+
if not os.listdir(dir_path):
|
41 |
+
message = f"The path: {dir_path} is empty"
|
42 |
+
if throw_if_empty:
|
43 |
+
raise RuntimeError(message)
|
44 |
+
else:
|
45 |
+
return []
|
46 |
+
|
47 |
+
search_string = str(dir_path) + "/*" + ext
|
48 |
+
return sorted(glob.glob(search_string))
|
49 |
+
|
50 |
+
|
51 |
+
# ---
|
52 |
+
|
53 |
+
|
54 |
+
description = """Experience a demo of the world's most advanced Text/Audio To Video (TTV) system, crafted by Two AI.
|
55 |
+
Sign up with Two AI to gain rapid, long-form generation, API keys, and more!"""
|
56 |
+
|
57 |
+
# Core constants
|
58 |
+
tmp_dir = "/tmp/gradio"
|
59 |
+
data_dir = "./data"
|
60 |
+
male_key = "male"
|
61 |
+
female_key = "female"
|
62 |
+
unknown_key = "unknown"
|
63 |
+
media_height = 512
|
64 |
+
|
65 |
+
# Male/Female
|
66 |
+
female_terms = ["Female", "Lady", "Woman"]
|
67 |
+
male_terms = ["Male", "Lad", "Man"]
|
68 |
+
|
69 |
+
# Elevenlabs Voices #
|
70 |
+
all_voices = ElevenLabsHelper.get_voices()
|
71 |
+
voices_ = [voice for voice in all_voices.voices if len(voice.name.split(" ")) < 2 and len(voice.name) < 10]
|
72 |
+
female_voice_names = ElevenLabsHelper.select_voices(voices_, labels={"gender": female_key, "age": "young"})
|
73 |
+
male_voice_names = ElevenLabsHelper.select_voices(voices_, labels={"gender": male_key, "age": "young"})
|
74 |
+
male_voice_names.remove("Priya")
|
75 |
+
voices = {
|
76 |
+
female_key: female_voice_names,
|
77 |
+
male_key: male_voice_names,
|
78 |
+
unknown_key: female_voice_names + male_voice_names,
|
79 |
+
}
|
80 |
+
|
81 |
+
# Examples
|
82 |
+
# Base Images
|
83 |
+
example_base_image_dir = os.path.join(data_dir, "input_image_bases")
|
84 |
+
example_base_images = {
|
85 |
+
female_key: get_sorted_filenames_in_dir(os.path.join(example_base_image_dir, female_key), ext=".jpg"),
|
86 |
+
male_key: get_sorted_filenames_in_dir(os.path.join(example_base_image_dir, male_key), ext=".jpg"),
|
87 |
+
}
|
88 |
+
|
89 |
+
# Base Videos
|
90 |
+
example_base_video_dir = os.path.join(data_dir, "input_video_bases")
|
91 |
+
example_source_videos = {
|
92 |
+
female_key: get_sorted_filenames_in_dir(os.path.join(example_base_video_dir, female_key), ext=".mp4"),
|
93 |
+
male_key: get_sorted_filenames_in_dir(os.path.join(example_base_video_dir, male_key), ext=".mp4"),
|
94 |
+
}
|
95 |
+
|
96 |
+
# Driving Audio
|
97 |
+
example_driving_audio_dir = os.path.join(data_dir, "input_audio/gradio")
|
98 |
+
example_driving_audios_male = get_sorted_filenames_in_dir(os.path.join(example_driving_audio_dir, male_key), ext=".mp3")
|
99 |
+
example_driving_audios_female = get_sorted_filenames_in_dir(
|
100 |
+
os.path.join(example_driving_audio_dir, female_key), ext=".mp3"
|
101 |
+
)
|
102 |
+
example_driving_audios = {female_key: example_driving_audios_female, male_key: example_driving_audios_male}
|
103 |
+
|
104 |
+
# Driving Text
|
105 |
+
audio_text_groups = ["General", "Promotional Messages", "Pronunciation Practice"]
|
106 |
+
example_driving_audio_texts = {
|
107 |
+
"General": [
|
108 |
+
"The 2026 World Cup final match is in New York.",
|
109 |
+
"Enhance efficiency and cut costs with AI.",
|
110 |
+
"A bee's wings beat more than 200 times per second.",
|
111 |
+
"2026년 월드컵 결승전은 뉴욕에서 열립니다.",
|
112 |
+
"AI로 효율성을 높이고 비용을 절감하세요.",
|
113 |
+
"벌은 초당 200회 이상의 날개짓을 합니다.",
|
114 |
+
"2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
|
115 |
+
"AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
|
116 |
+
"मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
|
117 |
+
],
|
118 |
+
"Promotional Messages": [
|
119 |
+
"Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
|
120 |
+
"Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
|
121 |
+
"This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
|
122 |
+
"Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
|
123 |
+
],
|
124 |
+
"Pronunciation Practice": [
|
125 |
+
"A big black bug bit a big black dog on his big black nose.",
|
126 |
+
"Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
|
127 |
+
],
|
128 |
+
}
|
129 |
+
|
130 |
+
example_showcase_dir = os.path.join(data_dir, "showcase_examples")
|
131 |
+
examples_showcase = {
|
132 |
+
"make_image_talk_multilingual": get_sorted_filenames_in_dir(
|
133 |
+
os.path.join(example_showcase_dir, "make_image_talk_multilingual"), ext=".mp4"
|
134 |
+
),
|
135 |
+
"make_image_talk_cartoon": get_sorted_filenames_in_dir(
|
136 |
+
os.path.join(example_showcase_dir, "make_image_talk_cartoon"), ext=".mp4"
|
137 |
+
),
|
138 |
+
"make_image_talk_diff_angles": get_sorted_filenames_in_dir(
|
139 |
+
os.path.join(example_showcase_dir, "make_image_talk_diff_angles"), ext=".mp4"
|
140 |
+
),
|
141 |
+
"make_image_talk_hb": get_sorted_filenames_in_dir(
|
142 |
+
os.path.join(example_showcase_dir, "make_image_talk_hb"), ext=".mp4"
|
143 |
+
),
|
144 |
+
"make_video_talk_multilingual": get_sorted_filenames_in_dir(
|
145 |
+
os.path.join(example_showcase_dir, "make_video_talk_multilingual"), ext=".mp4"
|
146 |
+
),
|
147 |
+
"make_video_talk_corp_msg": get_sorted_filenames_in_dir(
|
148 |
+
os.path.join(example_showcase_dir, "make_video_talk_corp_msg"), ext=".mp4"
|
149 |
+
),
|
150 |
+
"make_video_talk_rap_multii": get_sorted_filenames_in_dir(
|
151 |
+
os.path.join(example_showcase_dir, "make_video_talk_rap_multii"), ext=".mp4"
|
152 |
+
),
|
153 |
+
"dubbing_superpowerman": get_sorted_filenames_in_dir(os.path.join(example_showcase_dir, "dubbing_superpowerman"), ext=".mp4"),
|
154 |
+
"make_image_talk_selfie": get_sorted_filenames_in_dir(os.path.join(example_showcase_dir, "make_image_talk_selfie"), ext=".mp4"),
|
155 |
+
"dubbing_coffee": get_sorted_filenames_in_dir(os.path.join(example_showcase_dir, "dubbing_coffee"), ext=".mp4"),
|
156 |
+
}
|
157 |
+
|
158 |
+
|
159 |
+
def update_voices(media_path):
|
160 |
+
def get_category(media_path):
|
161 |
+
if media_path:
|
162 |
+
for fterm in female_terms:
|
163 |
+
if fterm in media_path or fterm.lower() in media_path:
|
164 |
+
return female_key
|
165 |
+
|
166 |
+
for mterm in male_terms:
|
167 |
+
if mterm in media_path or mterm.lower() in media_path:
|
168 |
+
return male_key
|
169 |
+
|
170 |
+
return unknown_key
|
171 |
+
|
172 |
+
category = get_category(media_path)
|
173 |
+
driving_input_voice = gr.Dropdown(
|
174 |
+
choices=voices[category],
|
175 |
+
value=voices[category][0],
|
176 |
+
interactive=True,
|
177 |
+
)
|
178 |
+
return driving_input_voice
|
179 |
+
|
180 |
+
|
181 |
+
def task_executor_fn(
|
182 |
+
input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
|
183 |
+
):
|
184 |
+
|
185 |
+
return task_executor.execute_task(
|
186 |
+
input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
|
187 |
+
)
|
188 |
+
|
189 |
+
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_image:
|
190 |
+
with gr.Row():
|
191 |
+
# Step 1: Choose Image
|
192 |
+
with gr.Column(scale=4):
|
193 |
+
gr.Markdown("### Step 1: Choose Image")
|
194 |
+
gr.Markdown("Upload or select an example image to drive.")
|
195 |
+
with gr.Accordion(open=True, label="Base Image"):
|
196 |
+
base_image_input = gr.Image(type="filepath", sources="upload", height=media_height)
|
197 |
+
gr.Examples(
|
198 |
+
examples=[[example] for example in example_base_images[female_key]],
|
199 |
+
inputs=[base_image_input],
|
200 |
+
cache_examples=False,
|
201 |
+
label="Female",
|
202 |
+
)
|
203 |
+
gr.Examples(
|
204 |
+
examples=[[example] for example in example_base_images[male_key]],
|
205 |
+
inputs=[base_image_input],
|
206 |
+
cache_examples=False,
|
207 |
+
label="Male",
|
208 |
+
)
|
209 |
+
|
210 |
+
# Step 2: Motion and Audio/TTS
|
211 |
+
with gr.Column(scale=4):
|
212 |
+
gr.Markdown("### Step 2: Motion and Audio/TTS")
|
213 |
+
gr.Markdown("Select motion and provide audio or text for lip-sync.")
|
214 |
+
with gr.Accordion(open=True, label="Base Motion"):
|
215 |
+
base_motion_expression = gr.Radio(
|
216 |
+
choices=valid_base_motion_expressions,
|
217 |
+
label="Select base motion",
|
218 |
+
value=get_default_base_motion_expression(),
|
219 |
+
)
|
220 |
+
with gr.Tabs():
|
221 |
+
with gr.TabItem("Driving Audio: File") as tab_audio_file:
|
222 |
+
with gr.Accordion(open=True, label="Driving Audio: From File"):
|
223 |
+
driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
|
224 |
+
gr.Examples(
|
225 |
+
examples=[[example] for example in example_driving_audios[female_key]],
|
226 |
+
inputs=[driving_audio_input],
|
227 |
+
cache_examples=False,
|
228 |
+
examples_per_page=18,
|
229 |
+
label="Female",
|
230 |
+
)
|
231 |
+
gr.Examples(
|
232 |
+
examples=[[example] for example in example_driving_audios[male_key]],
|
233 |
+
inputs=[driving_audio_input],
|
234 |
+
cache_examples=False,
|
235 |
+
examples_per_page=18,
|
236 |
+
label="Male",
|
237 |
+
)
|
238 |
+
|
239 |
+
with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
|
240 |
+
with gr.Accordion(open=True, label="Driving Audio: From Text"):
|
241 |
+
driving_input_voice = gr.Dropdown(
|
242 |
+
choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
|
243 |
+
)
|
244 |
+
driving_text_input = gr.Textbox(
|
245 |
+
label="Input Text (300 characters max)",
|
246 |
+
lines=2,
|
247 |
+
)
|
248 |
+
for group in audio_text_groups:
|
249 |
+
gr.Examples(
|
250 |
+
examples=[[example] for example in example_driving_audio_texts[group]],
|
251 |
+
inputs=[driving_text_input],
|
252 |
+
cache_examples=False,
|
253 |
+
label=group,
|
254 |
+
)
|
255 |
+
|
256 |
+
# Step 3: Result
|
257 |
+
with gr.Column(scale=4):
|
258 |
+
gr.Markdown("### Step 3: Result")
|
259 |
+
gr.Markdown("Generate and view the output video.")
|
260 |
+
process_button_animation = gr.Button("🌟 Generate", variant="primary")
|
261 |
+
output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
|
262 |
+
message = gr.Textbox(label="Info")
|
263 |
+
process_button_reset = gr.ClearButton(
|
264 |
+
[
|
265 |
+
base_image_input,
|
266 |
+
driving_audio_input,
|
267 |
+
driving_text_input,
|
268 |
+
driving_input_voice,
|
269 |
+
output_video_i2v,
|
270 |
+
],
|
271 |
+
value="🧹 Clear",
|
272 |
+
)
|
273 |
+
|
274 |
+
base_image_input.change(fn=update_voices, inputs=[base_image_input], outputs=[driving_input_voice])
|
275 |
+
|
276 |
+
# binding functions for buttons
|
277 |
+
process_button_animation.click(
|
278 |
+
fn=task_executor_fn,
|
279 |
+
inputs=[
|
280 |
+
base_image_input,
|
281 |
+
base_motion_expression,
|
282 |
+
driving_audio_input,
|
283 |
+
driving_text_input,
|
284 |
+
driving_input_voice,
|
285 |
+
],
|
286 |
+
outputs=[output_video_i2v, output_video_i2v, message],
|
287 |
+
show_progress=True,
|
288 |
+
)
|
289 |
+
|
290 |
+
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_video:
|
291 |
+
with gr.Row():
|
292 |
+
# Step 1: Choose Video
|
293 |
+
with gr.Column(scale=4):
|
294 |
+
gr.Markdown("### Step 1: Choose Video")
|
295 |
+
gr.Markdown("Upload or select an example video to drive.")
|
296 |
+
with gr.Accordion(open=True, label="Base Video"):
|
297 |
+
base_video_input = gr.Video(sources="upload", height=media_height, interactive=True)
|
298 |
+
gr.Examples(
|
299 |
+
examples=[[example] for example in example_source_videos[female_key]],
|
300 |
+
inputs=[base_video_input],
|
301 |
+
cache_examples=False,
|
302 |
+
label="Female",
|
303 |
+
)
|
304 |
+
gr.Examples(
|
305 |
+
examples=[[example] for example in example_source_videos[male_key]],
|
306 |
+
inputs=[base_video_input],
|
307 |
+
cache_examples=False,
|
308 |
+
label="Male",
|
309 |
+
)
|
310 |
+
|
311 |
+
# Step 2: Audio/TTS
|
312 |
+
with gr.Column(scale=4):
|
313 |
+
gr.Markdown("### Step 2: Audio/TTS")
|
314 |
+
gr.Markdown("Provide audio or text for lip-sync.")
|
315 |
+
with gr.Tabs():
|
316 |
+
with gr.TabItem("Driving Audio: File") as tab_audio_file:
|
317 |
+
with gr.Accordion(open=True, label="Driving Audio: From File"):
|
318 |
+
driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
|
319 |
+
gr.Examples(
|
320 |
+
examples=[[example] for example in example_driving_audios[female_key]],
|
321 |
+
inputs=[driving_audio_input],
|
322 |
+
cache_examples=False,
|
323 |
+
examples_per_page=18,
|
324 |
+
label="Female",
|
325 |
+
)
|
326 |
+
gr.Examples(
|
327 |
+
examples=[[example] for example in example_driving_audios[male_key]],
|
328 |
+
inputs=[driving_audio_input],
|
329 |
+
cache_examples=False,
|
330 |
+
examples_per_page=18,
|
331 |
+
label="Male",
|
332 |
+
)
|
333 |
+
with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
|
334 |
+
with gr.Accordion(open=True, label="Driving Audio: From Text"):
|
335 |
+
driving_input_voice = gr.Dropdown(
|
336 |
+
choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
|
337 |
+
)
|
338 |
+
driving_text_input = gr.Textbox(
|
339 |
+
label="Input Text (300 characters max)",
|
340 |
+
lines=2,
|
341 |
+
)
|
342 |
+
for group in audio_text_groups:
|
343 |
+
gr.Examples(
|
344 |
+
examples=[[example] for example in example_driving_audio_texts[group]],
|
345 |
+
inputs=[driving_text_input],
|
346 |
+
cache_examples=False,
|
347 |
+
label=group,
|
348 |
+
)
|
349 |
+
# Step 3: Result
|
350 |
+
with gr.Column(scale=4):
|
351 |
+
gr.Markdown("### Step 3: Result")
|
352 |
+
gr.Markdown("Generate and view the output video.")
|
353 |
+
process_button_animation = gr.Button("🌟 Generate", variant="primary")
|
354 |
+
output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
|
355 |
+
message = gr.Textbox(label="Info")
|
356 |
+
process_button_reset = gr.ClearButton(
|
357 |
+
[base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v],
|
358 |
+
value="🧹 Clear",
|
359 |
+
)
|
360 |
+
|
361 |
+
base_video_input.change(fn=update_voices, inputs=[base_video_input], outputs=[driving_input_voice])
|
362 |
+
|
363 |
+
# binding functions for buttons
|
364 |
+
base_motion_expression = gr.Radio(value=None, visible=False)
|
365 |
+
process_button_animation.click(
|
366 |
+
fn=task_executor_fn,
|
367 |
+
inputs=[
|
368 |
+
base_video_input,
|
369 |
+
base_motion_expression,
|
370 |
+
driving_audio_input,
|
371 |
+
driving_text_input,
|
372 |
+
driving_input_voice,
|
373 |
+
],
|
374 |
+
outputs=[output_video_i2v, output_video_i2v, message],
|
375 |
+
show_progress=True,
|
376 |
+
)
|
377 |
+
|
378 |
+
with gr.Blocks() as showcase_examples:
|
379 |
+
gr.Markdown("# Make Image Talk")
|
380 |
+
with gr.Row():
|
381 |
+
with gr.Column(scale=7):
|
382 |
+
for path in examples_showcase["make_image_talk_multilingual"]:
|
383 |
+
gr.Video(value=path, label=os.path.basename(path), height=300)
|
384 |
+
with gr.Column(scale=3):
|
385 |
+
for path in examples_showcase["make_image_talk_cartoon"]:
|
386 |
+
gr.Video(value=path, label=os.path.basename(path), height=616)
|
387 |
+
with gr.Row():
|
388 |
+
with gr.Column(scale=7):
|
389 |
+
for path in examples_showcase["make_image_talk_diff_angles"]:
|
390 |
+
gr.Video(value=path, label=os.path.basename(path), height=350)
|
391 |
+
with gr.Column(scale=3):
|
392 |
+
for path in examples_showcase["make_image_talk_hb"]:
|
393 |
+
gr.Video(value=path, label=os.path.basename(path), height=350)
|
394 |
+
with gr.Row():
|
395 |
+
for path in examples_showcase['make_image_talk_selfie']:
|
396 |
+
gr.Video(value=path, label=os.path.basename(path), height=430)
|
397 |
+
|
398 |
+
gr.Markdown("# Make Video Talk")
|
399 |
+
with gr.Row():
|
400 |
+
with gr.Column(scale=7):
|
401 |
+
for path in examples_showcase["make_video_talk_multilingual"]:
|
402 |
+
gr.Video(value=path, label=os.path.basename(path), height=300)
|
403 |
+
with gr.Column(scale=3):
|
404 |
+
for path in examples_showcase["make_video_talk_corp_msg"]:
|
405 |
+
gr.Video(value=path, label=os.path.basename(path), height=616)
|
406 |
+
with gr.Row():
|
407 |
+
for path in examples_showcase["make_video_talk_rap_multii"]:
|
408 |
+
gr.Video(value=path, label=os.path.basename(path), height=500)
|
409 |
+
|
410 |
+
gr.Markdown("# Dubbing")
|
411 |
+
with gr.Row():
|
412 |
+
for path in examples_showcase["dubbing_superpowerman"]:
|
413 |
+
gr.Video(value=path, label=os.path.basename(path), height=320)
|
414 |
+
with gr.Row():
|
415 |
+
for path in examples_showcase["dubbing_coffee"]:
|
416 |
+
gr.Video(value=path, label=os.path.basename(path), height=440)
|
417 |
+
|
418 |
+
with gr.Blocks(analytics_enabled=False, css="footer{display:none !important}", title="SUTRA Avatar v2") as demo:
|
419 |
+
gr.Markdown(
|
420 |
+
"""
|
421 |
+
## <img src="https://playground.two.ai/sutra.svg" height="20"/>
|
422 |
+
"""
|
423 |
+
)
|
424 |
+
title = "# 🌟 SUTRA Avatar v2 🌟\n## Drive Image or Video with LipSync from Audio or Text"
|
425 |
+
gr.Markdown(title)
|
426 |
+
gr.Markdown(description)
|
427 |
+
|
428 |
+
gr.TabbedInterface(
|
429 |
+
interface_list=[demo_image, demo_video, showcase_examples],
|
430 |
+
tab_names=["Drive Image", "Drive Video", "Showcase Examples"],
|
431 |
+
)
|
432 |
+
|
433 |
+
if __name__ == "__main__":
|
434 |
+
parser = argparse.ArgumentParser(description="SUTRA AVATAR CLIENT")
|
435 |
+
args = parser.parse_args()
|
436 |
+
task_executor = CloudTaskExecutor()
|
437 |
+
|
438 |
+
demo.queue(default_concurrency_limit=10).launch(
|
439 |
+
server_name="0.0.0.0",
|
440 |
+
allowed_paths=["/"],
|
441 |
+
)
|
base_task_executor.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
import re
|
4 |
+
import shutil
|
5 |
+
import time
|
6 |
+
from abc import ABC, abstractmethod
|
7 |
+
from datetime import datetime
|
8 |
+
from pathlib import Path
|
9 |
+
|
10 |
+
import gradio as gr
|
11 |
+
|
12 |
+
from elevenlabs_helper import ElevenLabsHelper
|
13 |
+
|
14 |
+
# ---
|
15 |
+
talk_key = "talk"
|
16 |
+
|
17 |
+
# ---
|
18 |
+
|
19 |
+
valid_image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
|
20 |
+
|
21 |
+
|
22 |
+
def is_image(file_path):
|
23 |
+
return file_path.lower().endswith(valid_image_exts)
|
24 |
+
|
25 |
+
|
26 |
+
def get_formatted_datetime_name() -> str:
|
27 |
+
d = datetime.now()
|
28 |
+
return d.strftime("d%y%m%d" + "-" + "t%H%M%S")
|
29 |
+
|
30 |
+
|
31 |
+
def get_name_ext(filepath):
|
32 |
+
filepath = os.path.abspath(filepath)
|
33 |
+
_, name_ext = os.path.split(filepath)
|
34 |
+
name, ext = os.path.splitext(name_ext)
|
35 |
+
return name, ext
|
36 |
+
|
37 |
+
|
38 |
+
def sanitize_string(string):
|
39 |
+
sanitized_string = re.sub(r"[^A-Za-z0-9]", "", string)
|
40 |
+
max_len = 15
|
41 |
+
return sanitized_string[:max_len]
|
42 |
+
|
43 |
+
|
44 |
+
def get_output_video_name(
|
45 |
+
input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path, tag=""
|
46 |
+
):
|
47 |
+
if not tag:
|
48 |
+
tag = get_formatted_datetime_name()
|
49 |
+
|
50 |
+
base_name, _ = get_name_ext(input_base_path)
|
51 |
+
base_name = sanitize_string(base_name)
|
52 |
+
|
53 |
+
driving_name = ""
|
54 |
+
if input_driving_path:
|
55 |
+
driving_name, _ = get_name_ext(input_driving_path)
|
56 |
+
driving_name = sanitize_string(driving_name)
|
57 |
+
elif base_motion_expression and is_image(input_base_path):
|
58 |
+
driving_name = base_motion_expression
|
59 |
+
|
60 |
+
audio_name = ""
|
61 |
+
if input_driving_audio_path:
|
62 |
+
audio_name, _ = get_name_ext(input_driving_audio_path)
|
63 |
+
audio_name = sanitize_string(audio_name)
|
64 |
+
|
65 |
+
output_video_name = f"{tag}--b-{base_name}"
|
66 |
+
|
67 |
+
if driving_name:
|
68 |
+
output_video_name += f"--d-{driving_name}"
|
69 |
+
|
70 |
+
if audio_name:
|
71 |
+
output_video_name += f"--a-{audio_name}"
|
72 |
+
return output_video_name
|
73 |
+
|
74 |
+
|
75 |
+
def generate_random_integer(num_digits):
|
76 |
+
current_time = int(time.time() * 1000)
|
77 |
+
random.seed(current_time)
|
78 |
+
lower_bound = 0
|
79 |
+
upper_bound = (10**num_digits) - 1
|
80 |
+
return random.randint(lower_bound, upper_bound)
|
81 |
+
|
82 |
+
|
83 |
+
def get_unique_name(maxd=4, delim="-"):
|
84 |
+
pid = os.getpid()
|
85 |
+
pid_str = str(pid)[-maxd:]
|
86 |
+
|
87 |
+
time_ns = time.time_ns()
|
88 |
+
time_str = str(time_ns)[-maxd:]
|
89 |
+
|
90 |
+
rint = generate_random_integer(maxd)
|
91 |
+
rint_str = str(rint).zfill(maxd)
|
92 |
+
return delim.join([pid_str, time_str, rint_str])
|
93 |
+
|
94 |
+
|
95 |
+
def mkdir_p(path: str) -> None:
|
96 |
+
if not Path(path).exists():
|
97 |
+
Path(path).mkdir(parents=True)
|
98 |
+
|
99 |
+
|
100 |
+
# ---
|
101 |
+
|
102 |
+
|
103 |
+
class BaseTaskExecutor(ABC):
    """Orchestrates one avatar-generation request.

    Validates the driving inputs, optionally synthesizes TTS audio via
    ElevenLabs, delegates rendering to the concrete `generate`
    implementation, and removes the per-request scratch directory.
    """

    def __init__(self):
        # Per-request scratch space; each task gets a unique subdirectory.
        self.tmp_dir = "/tmp/gradio"

    def execute_task(
        self, input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
    ):
        """Run a single generation task and return Gradio-ready outputs.

        Returns:
            (video_path, gr.update, messages); on failure video_path is
            None and *messages* carries the error text.

        Raises:
            gr.Error: if no base image/video was provided.
        """
        tag = get_unique_name()
        output_dir = os.path.join(self.tmp_dir, tag)
        mkdir_p(output_dir)

        # Audio can come from an uploaded file or from TTS (text + voice).
        do_dafile = input_driving_audio_path is not None and os.path.exists(input_driving_audio_path)
        do_datts = driving_text_input and driving_voice_input
        do_talk = do_dafile or do_datts

        if base_motion_expression:
            # Expressive (non-talking) base motions are mutually exclusive
            # with driving audio; audio wins only for talking motions.
            if talk_key not in base_motion_expression and do_talk:
                gr.Warning(
                    f"Ignoring Driving Audio since expressive Base Motion selected: {base_motion_expression}")
                do_dafile = False
                do_datts = False
                do_talk = False

            if talk_key in base_motion_expression and not do_talk:
                gr.Warning("Selected talking Base Motion but no Driving Audio")
        else:
            base_motion_expression = ""

        if do_datts:
            if do_dafile:
                gr.Warning("Ignoring Audio File input since TTS is selected.\nClear the undesired input if this is not intended.")
            # BUGFIX: os.path.join() was previously called with a single
            # pre-joined f-string argument (a no-op join); join the
            # directory and filename as separate components.
            output_audio_file = os.path.join(output_dir, f"{tag}.mp3")
            ElevenLabsHelper.generate_voice(driving_text_input, driving_voice_input, output_audio_file)
            input_driving_audio_path = output_audio_file

        if not do_talk:
            input_driving_audio_path = ""

        if input_base_path is not None and os.path.exists(input_base_path):
            input_driving_path = ""  # driving video is unused in this flow
            request_id = get_unique_name(maxd=8, delim="")
            output_video_path = os.path.join(
                self.tmp_dir,
                get_output_video_name(
                    input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path
                )
                + ".mp4",
            )
            result, output_video_path = self.generate(
                input_base_path,
                input_driving_path,
                base_motion_expression,
                input_driving_audio_path,
                output_video_path,
                request_id,
            )
            success = result["success"]
            messages = result["messages"]

            self.clean(output_dir)

            if success:
                return output_video_path, gr.update(visible=True), messages
            else:
                gr.Info("Task could not be completed", duration=4)
                return None, gr.update(visible=False), f"ERROR\n\n{messages}"
        else:
            self.clean(output_dir)
            raise gr.Error("No source selected!", duration=6)

    @abstractmethod
    def generate(
        self,
        input_base_path,
        input_driving_path,
        base_motion_expression,
        input_driving_audio_path,
        output_video_path,
        request_id,
    ):
        """Render the avatar video; return (result dict, output video path).

        FIX: the abstract signature previously declared only ``self``,
        which did not match the concrete implementations or the call in
        ``execute_task``.
        """

    def clean(self, output_dir):
        """Remove the per-request scratch directory if it exists."""
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
|
cloud_task_executor.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import json
|
3 |
+
import ntpath
|
4 |
+
import os
|
5 |
+
import time
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import requests
|
9 |
+
from google.cloud import storage
|
10 |
+
|
11 |
+
from base_task_executor import BaseTaskExecutor
|
12 |
+
|
13 |
+
# ---
|
14 |
+
enc = "utf-8"  # default text encoding for base64 payloads
|
15 |
+
|
16 |
+
|
17 |
+
def decode(string):
    """Decode a base64-encoded JSON document into a Python object."""
    raw = base64.b64decode(string)
    return json.loads(raw)
|
19 |
+
|
20 |
+
|
21 |
+
def get_storage_client_from_env():
    """Build a GCS client from the base64-encoded service-account JSON
    stored in the ``GCP_API_KEY`` environment variable."""
    service_account_info = decode(os.environ["GCP_API_KEY"])
    return storage.Client.from_service_account_info(service_account_info)
|
24 |
+
|
25 |
+
|
26 |
+
def get_name_ext(filepath):
    """Return (stem, extension) of *filepath*'s final path component,
    after absolutizing the path."""
    tail = os.path.basename(os.path.abspath(filepath))
    return os.path.splitext(tail)
|
31 |
+
|
32 |
+
|
33 |
+
def make_remote_media_path(request_id, media_path):
    """Build the remote object path ``<src>/<slot>/<suffix>/<filename>``
    by partitioning a request id (>6 chars) as 3 + 3 + remainder.

    Raises:
        ValueError: if *request_id* is too short to partition.
        FileNotFoundError: if *media_path* does not exist.

    BUGFIX: input validation previously used ``assert``, which is stripped
    under ``python -O``; explicit exceptions are always enforced.
    """
    if len(request_id) <= 6:
        raise ValueError(f"request_id too short (need >6 chars): {request_id!r}")
    if not os.path.exists(media_path):
        raise FileNotFoundError(media_path)
    src_id = request_id[:3]
    slot_id = request_id[3:6]
    request_suffix = request_id[6:]
    # stem + extension of the absolutized path == its basename
    filename = os.path.basename(os.path.abspath(media_path))
    return os.path.join(src_id, slot_id, request_suffix, filename)
|
41 |
+
|
42 |
+
|
43 |
+
def copy_file_to_gcloud(bucket, local_file_path, remote_file_path):
    """Upload a local file to *remote_file_path* inside the given GCS bucket."""
    bucket.blob(remote_file_path).upload_from_filename(local_file_path)
|
46 |
+
|
47 |
+
def copy_to_gcloud(storage_client, local_media_path, bucket_name, remote_media_path):
    """Resolve *bucket_name* via the client and upload the local file to it."""
    target_bucket = storage_client.get_bucket(bucket_name)
    copy_file_to_gcloud(target_bucket, local_media_path, remote_media_path)
|
50 |
+
|
51 |
+
|
52 |
+
# ---
|
53 |
+
|
54 |
+
|
55 |
+
class CloudTaskExecutor(BaseTaskExecutor):
    """Task executor that uploads inputs to GCS and drives a remote
    rendering service over HTTP, polling until the task completes.

    Configuration comes from env vars SUTRA_AVATAR_BASE_URL,
    SUTRA_AVATAR_API_KEY, SUTRA_AVATAR_BUCKET_NAME, and GCP_API_KEY.
    """

    def __init__(self):
        super().__init__()
        self.base_url = os.getenv("SUTRA_AVATAR_BASE_URL")
        self.headers = {"Authorization": f'{os.getenv("SUTRA_AVATAR_API_KEY")}', "Content-Type": "application/json"}
        self.bucket_name = os.getenv("SUTRA_AVATAR_BUCKET_NAME")
        self.storage_client = get_storage_client_from_env()

    def submit_task(self, submit_request):
        """POST the task request and return the service's JSON reply.

        Raises:
            requests.HTTPError: for 4xx/5xx responses.

        BUGFIX: the old ``== 200`` check silently returned None for other
        2xx codes; raise_for_status covers every error code in one call.
        """
        url = f"{self.base_url}/task/submit"
        response = requests.post(url, json=submit_request, headers=self.headers)
        response.raise_for_status()
        return response.json()

    def get_task_status(self, request_id):
        """GET the current status of *request_id* and return the JSON reply.

        Raises:
            requests.HTTPError: for 4xx/5xx responses.
        """
        url = f"{self.base_url}/task/status"
        response = requests.get(url, params={"rid": request_id}, headers=self.headers)
        response.raise_for_status()
        return response.json()

    def generate(
        self,
        input_base_path,
        input_driving_path,
        base_motion_expression,
        input_driving_audio_path,
        output_video_path,
        request_id,
    ):
        """Upload media, submit the remote task, and poll to completion.

        Returns:
            (result, output_video_path): *result* carries "success" and
            "messages"; on success *output_video_path* is replaced by the
            remote video URL from the status reply.
        """
        # Upload the base image/video and the (optional) audio to GCS.
        for media_path in (input_base_path, input_driving_audio_path):
            if media_path:
                remote_media_path = make_remote_media_path(request_id, media_path)
                copy_to_gcloud(self.storage_client, media_path, self.bucket_name, remote_media_path)

        # The service resolves bare filenames against the uploaded objects.
        submit_request = {
            "requestId": request_id,
            "input_base_path": ntpath.basename(input_base_path),
            "input_driving_path": "",
            "base_motion_expression": base_motion_expression,
            "input_driving_audio_path": ntpath.basename(input_driving_audio_path),
            "output_video_path": ntpath.basename(output_video_path),
        }
        submit_reply = self.submit_task(submit_request)
        estimated_wait_seconds = submit_reply.get("estimatedWaitSeconds", "unknown")

        completion_statuses = {"Succeeded", "Cancelled", "Failed", "NotFound"}
        timeout = 240  # maximum time to wait in seconds
        if isinstance(estimated_wait_seconds, int):
            timeout += estimated_wait_seconds
        start_time = time.time()

        result = {"success": False, "messages": ""}
        while True:
            status_reply = self.get_task_status(request_id)
            if status_reply["taskStatus"] in completion_statuses:
                break
            if time.time() - start_time > timeout:
                msg = "The task did not complete within the timeout period.\n The server is very busy serving other requests.\n Please try again."
                result["messages"] = msg
                # BUGFIX: gr.Error was instantiated but never raised — a
                # no-op; gr.Warning surfaces the message without aborting.
                gr.Warning(msg)
                break
            time.sleep(3)  # poll interval

        if status_reply["taskStatus"] == "Succeeded":
            pipe_reply = status_reply["pipeReply"]
            result["success"] = pipe_reply["status"] == "success"
            result["messages"] = pipe_reply["messages"]
            output_video_path = status_reply["videoURL"]  # remote URL
        else:
            # Timed out, cancelled, failed, or unknown: append any
            # pipeline messages after the (possible) timeout message.
            if "pipeReply" in status_reply:
                result["messages"] += status_reply["pipeReply"]["messages"]
        return result, output_video_path
|
data/input_audio/gradio/female/en-BeesWingsBeat-Shelby.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2a85a13e25fb823143e26a39ce6de823861199b90784db4461a243d01f87201
|
3 |
+
size 55588
|
data/input_audio/gradio/female/en-EnhanceEfficiency-Shelby.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35135724f58a72574c9f92e5bcdec1c41eac7f02480fc306b648263f0750a742
|
3 |
+
size 60604
|
data/input_audio/gradio/female/en-The2026WorldCup-Shelby.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c4054806558c0f2b26313a5b352b042fdc7dba0c90eac36e9c0c667dd00bcf3
|
3 |
+
size 71053
|
data/input_audio/gradio/female/hi-BeesWingsBeat-Matilda.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:981852faccc81eccf82effc8ad3a2bef134c447c038ec15c4c7ff418c1a40c25
|
3 |
+
size 57678
|
data/input_audio/gradio/female/hi-EnhanceEfficiency-Matilda.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:568d0dd0fad0648e711fa50e0c048cad18df52e03a87503ff382379686acf89b
|
3 |
+
size 48065
|
data/input_audio/gradio/female/hi-The2026WorldCup-Matilda.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a304d592f4d2b10a91f7b82b25416813ca891b50e64fb513aa7f3cf1b8f0cd7c
|
3 |
+
size 53498
|
data/input_audio/gradio/female/ko-BeesWingsBeat-Jinju.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13e97b106f1757b8f64cecb33ae9265eaba0dfa5a28bb6f27d1f42534937f203
|
3 |
+
size 47229
|
data/input_audio/gradio/female/ko-EnhanceEfficiency-Jinju.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34144828e9499c22fa5d7be6a621aadea5f0a25d68dca04a6ad3b65f01dfa36d
|
3 |
+
size 48065
|
data/input_audio/gradio/female/ko-The2026WorldCup-Jinju.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60f6dc9a567be17f2edc9d4fa5e877a4025e7acabdc4260014612b420f7b2981
|
3 |
+
size 57678
|
data/input_audio/gradio/male/en-BeesWingsBeat-Marcus.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b677ad256f0d28d1c9c9afabb347d7b1520aadd1b0e19ca09665fe3b9a7adfed
|
3 |
+
size 46811
|
data/input_audio/gradio/male/en-EnhanceEfficiency-Marcus.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:770cce3bbfca0913ceb8651584d6515c8f271bffb45d11e0f76ecf96af19e00a
|
3 |
+
size 40542
|
data/input_audio/gradio/male/en-The2026WorldCup-Marcus.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86f34c9f42944b8a76cc727c06f28556630d94a82304b37890919cb64d8cab51
|
3 |
+
size 57260
|
data/input_audio/gradio/male/hi-BeesWingsBeat-Liam.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f767e72ca739f8e3ba3edea24f5f9b533bfdbef37c60db02125dd1c18d54a1ef
|
3 |
+
size 64365
|
data/input_audio/gradio/male/hi-EnhanceEfficiency-Liam.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fcdb3e0776e8aa60778d97dc9a73beaa81b6b94a2b31cf4e34437fdc12233425
|
3 |
+
size 50991
|
data/input_audio/gradio/male/hi-The2026WorldCup-Liam.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f292ae2e165b6fb713807888ab604848bf02f162f1621d47cd06bfc1926dd7f
|
3 |
+
size 54752
|
data/input_audio/gradio/male/ko-BeesWingsBeat-Noah.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4bdcbdf30de7b6fbadd04099c08e47812311aeb1fcc5bb2c87ac4d92ab5d9a90
|
3 |
+
size 47229
|
data/input_audio/gradio/male/ko-EnhanceEfficiency-Noah.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be33748c1b19c74abb6f2daaa343d4c5c2c5c8c00a7a03d2fbc20ca8e08ef9a6
|
3 |
+
size 44303
|
data/input_audio/gradio/male/ko-The2026WorldCup-Noah.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a4ea9b5d46d6e419b59b875b0b84203170736a394e4eb676e7da70af8261d64
|
3 |
+
size 58514
|
data/input_image_bases/female/01-Female-American_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/02-Female-Indian01_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/03-Female-Korean_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/04-Female-Indian02_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/05-Female-European_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/01-Male-Indian_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/02-Male-Korean_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/03-Male-European_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/04-Male-American_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/05-Male-AfricanAmerican_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_video_bases/female/01-Female-Korean_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ecf7828e7d0f421767d190b3555868728b184edac1f4a0201820f1c58865d7c
|
3 |
+
size 2000776
|
data/input_video_bases/female/02-Female-Latina_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6884cd58b987f02443d83b3faae37951aa33a689245c3bf65725f609c6303789
|
3 |
+
size 2666194
|
data/input_video_bases/female/03-Female-European_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbde154264db6fbcb94e3c93c529b365f67e667473cc8a1445e0e9223ce6ea8b
|
3 |
+
size 1625368
|
data/input_video_bases/female/04-Female-Indian_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a3a358644c023f7cde032e5570d9b39b615b594d8ab6747456a2c60ac9a1f1c
|
3 |
+
size 1529791
|
data/input_video_bases/female/05-Female-American_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35a91366a511a6b27f15edca2b5b6428e1ea3781971c9ac4202a34c49c0cef89
|
3 |
+
size 1903512
|
data/input_video_bases/male/01-Male-Japanese_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9325107bacf0442932b74f88fc861a008fbbf4770f32074a0f818cc7f69c1759
|
3 |
+
size 1770959
|
data/input_video_bases/male/02-Male-European_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0eb1e61a0b6f22a4fcfd3acb90c5e661396678fcde7eca3edd394f1223483ea
|
3 |
+
size 1693659
|
data/input_video_bases/male/03-Male-American02_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68c9427293f6b721ac180f596b71ea4df1e5a5f5d3938f7ac9ac16df2007562f
|
3 |
+
size 1927639
|
data/input_video_bases/male/04-Male-Indian_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e12f8f6c70d602ad8c8f422ffd703a6c012b453d9902245b82b4ae0c051397d6
|
3 |
+
size 1352685
|
data/input_video_bases/male/05-Male-American_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:448f993473f7a8291f8591856e15701c7e9bb373ddbf9e9c8a773d69b84601ac
|
3 |
+
size 1854230
|
data/showcase_examples/archive/01 Multilingual Female_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c67441bab5596482bfcb40c725c0829fb7b4df1a5642e43661b6553b20cefed2
|
3 |
+
size 17771532
|
data/showcase_examples/archive/02 Multilingual Male_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82605475898eddb08165ec3429bb933e94a765d23c8c7a4ef1ecfa70363a4638
|
3 |
+
size 13215459
|
data/showcase_examples/archive/02 Multilingual Male_720_IM.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d63481b053c30f05600791361914e9d2f7a17d003da56d1776f319622d8ec0a3
|
3 |
+
size 17479793
|
data/showcase_examples/archive/03 Corporate Message_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:917db888f55ada94ee47b2f05a0ed2274f71d750b25f3c11ae5e9bc4b86a663c
|
3 |
+
size 2930433
|
data/showcase_examples/archive/04 Multi-Identities: Multilingual_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:414fd98b0816cbd2834353b87dcb3e3f41e3c47423c0b50040a79461c225f500
|
3 |
+
size 5313472
|
data/showcase_examples/archive/05 Multi-Identities: Rap_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:efcc23f689bc5067a30ab46efaa6d546c46cf422427dbb058fde6b8be066fbd3
|
3 |
+
size 2556681
|