Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Michael Sapienza
commited on
Commit
·
ec17e66
0
Parent(s):
initial commit of sutra-avatar-v2
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +39 -0
- .gitignore +1 -0
- README.md +13 -0
- app.py +441 -0
- base_task_executor.py +179 -0
- cloud_task_executor.py +143 -0
- data/input_audio/gradio/female/en-BeesWingsBeat-Shelby.mp3 +3 -0
- data/input_audio/gradio/female/en-EnhanceEfficiency-Shelby.mp3 +3 -0
- data/input_audio/gradio/female/en-The2026WorldCup-Shelby.mp3 +3 -0
- data/input_audio/gradio/female/hi-BeesWingsBeat-Matilda.mp3 +3 -0
- data/input_audio/gradio/female/hi-EnhanceEfficiency-Matilda.mp3 +3 -0
- data/input_audio/gradio/female/hi-The2026WorldCup-Matilda.mp3 +3 -0
- data/input_audio/gradio/female/ko-BeesWingsBeat-Jinju.mp3 +3 -0
- data/input_audio/gradio/female/ko-EnhanceEfficiency-Jinju.mp3 +3 -0
- data/input_audio/gradio/female/ko-The2026WorldCup-Jinju.mp3 +3 -0
- data/input_audio/gradio/male/en-BeesWingsBeat-Marcus.mp3 +3 -0
- data/input_audio/gradio/male/en-EnhanceEfficiency-Marcus.mp3 +3 -0
- data/input_audio/gradio/male/en-The2026WorldCup-Marcus.mp3 +3 -0
- data/input_audio/gradio/male/hi-BeesWingsBeat-Liam.mp3 +3 -0
- data/input_audio/gradio/male/hi-EnhanceEfficiency-Liam.mp3 +3 -0
- data/input_audio/gradio/male/hi-The2026WorldCup-Liam.mp3 +3 -0
- data/input_audio/gradio/male/ko-BeesWingsBeat-Noah.mp3 +3 -0
- data/input_audio/gradio/male/ko-EnhanceEfficiency-Noah.mp3 +3 -0
- data/input_audio/gradio/male/ko-The2026WorldCup-Noah.mp3 +3 -0
- data/input_image_bases/female/01-Female-American_608.jpg +3 -0
- data/input_image_bases/female/02-Female-Indian01_608.jpg +3 -0
- data/input_image_bases/female/03-Female-Korean_608.jpg +3 -0
- data/input_image_bases/female/04-Female-Indian02_608.jpg +3 -0
- data/input_image_bases/female/05-Female-European_608.jpg +3 -0
- data/input_image_bases/male/01-Male-Indian_608.jpg +3 -0
- data/input_image_bases/male/02-Male-Korean_608.jpg +3 -0
- data/input_image_bases/male/03-Male-European_608.jpg +3 -0
- data/input_image_bases/male/04-Male-American_608.jpg +3 -0
- data/input_image_bases/male/05-Male-AfricanAmerican_608.jpg +3 -0
- data/input_video_bases/female/01-Female-Korean_608.mp4 +3 -0
- data/input_video_bases/female/02-Female-Latina_608.mp4 +3 -0
- data/input_video_bases/female/03-Female-European_608.mp4 +3 -0
- data/input_video_bases/female/04-Female-Indian_608.mp4 +3 -0
- data/input_video_bases/female/05-Female-American_608.mp4 +3 -0
- data/input_video_bases/male/01-Male-Japanese_608.mp4 +3 -0
- data/input_video_bases/male/02-Male-European_608.mp4 +3 -0
- data/input_video_bases/male/03-Male-American02_608.mp4 +3 -0
- data/input_video_bases/male/04-Male-Indian_608.mp4 +3 -0
- data/input_video_bases/male/05-Male-American_608.mp4 +3 -0
- data/showcase_examples/archive/01 Multilingual Female_720.mp4 +3 -0
- data/showcase_examples/archive/02 Multilingual Male_720.mp4 +3 -0
- data/showcase_examples/archive/02 Multilingual Male_720_IM.mp4 +3 -0
- data/showcase_examples/archive/03 Corporate Message_720.mp4 +3 -0
- data/showcase_examples/archive/04 Multi-Identities: Multilingual_720.mp4 +3 -0
- data/showcase_examples/archive/05 Multi-Identities: Rap_720.mp4 +3 -0
.gitattributes
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
38 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: sutra-avatar-v2
|
3 |
+
emoji: 🐨
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.3.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding: utf-8
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import glob
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
|
10 |
+
from cloud_task_executor import CloudTaskExecutor
|
11 |
+
from elevenlabs_helper import ElevenLabsHelper
|
12 |
+
|
13 |
+
# ---
|
14 |
+
talk_key = "talk"
|
15 |
+
valid_base_motion_expressions = [
|
16 |
+
f"{talk_key}-head",
|
17 |
+
f"{talk_key}-neutral",
|
18 |
+
"smile",
|
19 |
+
"approve",
|
20 |
+
"disapprove",
|
21 |
+
"confused",
|
22 |
+
"sad",
|
23 |
+
"surprised",
|
24 |
+
]
|
25 |
+
|
26 |
+
|
27 |
+
def get_default_base_motion_expression():
|
28 |
+
return valid_base_motion_expressions[0]
|
29 |
+
|
30 |
+
|
31 |
+
# ---
|
32 |
+
|
33 |
+
|
34 |
+
def get_sorted_filenames_in_dir(dir_path: str, ext: str = ".jpg", throw_if_empty: bool = True) -> list:
|
35 |
+
"""Return the sorted filenames in the spedified directory."""
|
36 |
+
p = Path(dir_path)
|
37 |
+
if not p.exists() and not p.is_dir():
|
38 |
+
raise RuntimeError(f"The path: {dir_path} does not exist")
|
39 |
+
|
40 |
+
if not os.listdir(dir_path):
|
41 |
+
message = f"The path: {dir_path} is empty"
|
42 |
+
if throw_if_empty:
|
43 |
+
raise RuntimeError(message)
|
44 |
+
else:
|
45 |
+
return []
|
46 |
+
|
47 |
+
search_string = str(dir_path) + "/*" + ext
|
48 |
+
return sorted(glob.glob(search_string))
|
49 |
+
|
50 |
+
|
51 |
+
# ---
|
52 |
+
|
53 |
+
|
54 |
+
description = """Experience a demo of the world's most advanced Text/Audio To Video (TTV) system, crafted by Two AI.
|
55 |
+
Sign up with Two AI to gain rapid, long-form generation, API keys, and more!"""
|
56 |
+
|
57 |
+
# Core constants
|
58 |
+
tmp_dir = "/tmp/gradio"
|
59 |
+
data_dir = "./data"
|
60 |
+
male_key = "male"
|
61 |
+
female_key = "female"
|
62 |
+
unknown_key = "unknown"
|
63 |
+
media_height = 512
|
64 |
+
|
65 |
+
# Male/Female
|
66 |
+
female_terms = ["Female", "Lady", "Woman"]
|
67 |
+
male_terms = ["Male", "Lad", "Man"]
|
68 |
+
|
69 |
+
# Elevenlabs Voices #
|
70 |
+
all_voices = ElevenLabsHelper.get_voices()
|
71 |
+
voices_ = [voice for voice in all_voices.voices if len(voice.name.split(" ")) < 2 and len(voice.name) < 10]
|
72 |
+
female_voice_names = ElevenLabsHelper.select_voices(voices_, labels={"gender": female_key, "age": "young"})
|
73 |
+
male_voice_names = ElevenLabsHelper.select_voices(voices_, labels={"gender": male_key, "age": "young"})
|
74 |
+
male_voice_names.remove("Priya")
|
75 |
+
voices = {
|
76 |
+
female_key: female_voice_names,
|
77 |
+
male_key: male_voice_names,
|
78 |
+
unknown_key: female_voice_names + male_voice_names,
|
79 |
+
}
|
80 |
+
|
81 |
+
# Examples
|
82 |
+
# Base Images
|
83 |
+
example_base_image_dir = os.path.join(data_dir, "input_image_bases")
|
84 |
+
example_base_images = {
|
85 |
+
female_key: get_sorted_filenames_in_dir(os.path.join(example_base_image_dir, female_key), ext=".jpg"),
|
86 |
+
male_key: get_sorted_filenames_in_dir(os.path.join(example_base_image_dir, male_key), ext=".jpg"),
|
87 |
+
}
|
88 |
+
|
89 |
+
# Base Videos
|
90 |
+
example_base_video_dir = os.path.join(data_dir, "input_video_bases")
|
91 |
+
example_source_videos = {
|
92 |
+
female_key: get_sorted_filenames_in_dir(os.path.join(example_base_video_dir, female_key), ext=".mp4"),
|
93 |
+
male_key: get_sorted_filenames_in_dir(os.path.join(example_base_video_dir, male_key), ext=".mp4"),
|
94 |
+
}
|
95 |
+
|
96 |
+
# Driving Audio
|
97 |
+
example_driving_audio_dir = os.path.join(data_dir, "input_audio/gradio")
|
98 |
+
example_driving_audios_male = get_sorted_filenames_in_dir(os.path.join(example_driving_audio_dir, male_key), ext=".mp3")
|
99 |
+
example_driving_audios_female = get_sorted_filenames_in_dir(
|
100 |
+
os.path.join(example_driving_audio_dir, female_key), ext=".mp3"
|
101 |
+
)
|
102 |
+
example_driving_audios = {female_key: example_driving_audios_female, male_key: example_driving_audios_male}
|
103 |
+
|
104 |
+
# Driving Text
|
105 |
+
audio_text_groups = ["General", "Promotional Messages", "Pronunciation Practice"]
|
106 |
+
example_driving_audio_texts = {
|
107 |
+
"General": [
|
108 |
+
"The 2026 World Cup final match is in New York.",
|
109 |
+
"Enhance efficiency and cut costs with AI.",
|
110 |
+
"A bee's wings beat more than 200 times per second.",
|
111 |
+
"2026년 월드컵 결승전은 뉴욕에서 열립니다.",
|
112 |
+
"AI로 효율성을 높이고 비용을 절감하세요.",
|
113 |
+
"벌은 초당 200회 이상의 날개짓을 합니다.",
|
114 |
+
"2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
|
115 |
+
"AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
|
116 |
+
"मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
|
117 |
+
],
|
118 |
+
"Promotional Messages": [
|
119 |
+
"Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
|
120 |
+
"Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
|
121 |
+
"This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
|
122 |
+
"Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
|
123 |
+
],
|
124 |
+
"Pronunciation Practice": [
|
125 |
+
"A big black bug bit a big black dog on his big black nose.",
|
126 |
+
"Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
|
127 |
+
],
|
128 |
+
}
|
129 |
+
|
130 |
+
example_showcase_dir = os.path.join(data_dir, "showcase_examples")
|
131 |
+
examples_showcase = {
|
132 |
+
"make_image_talk_multilingual": get_sorted_filenames_in_dir(
|
133 |
+
os.path.join(example_showcase_dir, "make_image_talk_multilingual"), ext=".mp4"
|
134 |
+
),
|
135 |
+
"make_image_talk_cartoon": get_sorted_filenames_in_dir(
|
136 |
+
os.path.join(example_showcase_dir, "make_image_talk_cartoon"), ext=".mp4"
|
137 |
+
),
|
138 |
+
"make_image_talk_diff_angles": get_sorted_filenames_in_dir(
|
139 |
+
os.path.join(example_showcase_dir, "make_image_talk_diff_angles"), ext=".mp4"
|
140 |
+
),
|
141 |
+
"make_image_talk_hb": get_sorted_filenames_in_dir(
|
142 |
+
os.path.join(example_showcase_dir, "make_image_talk_hb"), ext=".mp4"
|
143 |
+
),
|
144 |
+
"make_video_talk_multilingual": get_sorted_filenames_in_dir(
|
145 |
+
os.path.join(example_showcase_dir, "make_video_talk_multilingual"), ext=".mp4"
|
146 |
+
),
|
147 |
+
"make_video_talk_corp_msg": get_sorted_filenames_in_dir(
|
148 |
+
os.path.join(example_showcase_dir, "make_video_talk_corp_msg"), ext=".mp4"
|
149 |
+
),
|
150 |
+
"make_video_talk_rap_multii": get_sorted_filenames_in_dir(
|
151 |
+
os.path.join(example_showcase_dir, "make_video_talk_rap_multii"), ext=".mp4"
|
152 |
+
),
|
153 |
+
"dubbing_superpowerman": get_sorted_filenames_in_dir(os.path.join(example_showcase_dir, "dubbing_superpowerman"), ext=".mp4"),
|
154 |
+
"make_image_talk_selfie": get_sorted_filenames_in_dir(os.path.join(example_showcase_dir, "make_image_talk_selfie"), ext=".mp4"),
|
155 |
+
"dubbing_coffee": get_sorted_filenames_in_dir(os.path.join(example_showcase_dir, "dubbing_coffee"), ext=".mp4"),
|
156 |
+
}
|
157 |
+
|
158 |
+
|
159 |
+
def update_voices(media_path):
|
160 |
+
def get_category(media_path):
|
161 |
+
if media_path:
|
162 |
+
for fterm in female_terms:
|
163 |
+
if fterm in media_path or fterm.lower() in media_path:
|
164 |
+
return female_key
|
165 |
+
|
166 |
+
for mterm in male_terms:
|
167 |
+
if mterm in media_path or mterm.lower() in media_path:
|
168 |
+
return male_key
|
169 |
+
|
170 |
+
return unknown_key
|
171 |
+
|
172 |
+
category = get_category(media_path)
|
173 |
+
driving_input_voice = gr.Dropdown(
|
174 |
+
choices=voices[category],
|
175 |
+
value=voices[category][0],
|
176 |
+
interactive=True,
|
177 |
+
)
|
178 |
+
return driving_input_voice
|
179 |
+
|
180 |
+
|
181 |
+
def task_executor_fn(
|
182 |
+
input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
|
183 |
+
):
|
184 |
+
|
185 |
+
return task_executor.execute_task(
|
186 |
+
input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
|
187 |
+
)
|
188 |
+
|
189 |
+
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_image:
|
190 |
+
with gr.Row():
|
191 |
+
# Step 1: Choose Image
|
192 |
+
with gr.Column(scale=4):
|
193 |
+
gr.Markdown("### Step 1: Choose Image")
|
194 |
+
gr.Markdown("Upload or select an example image to drive.")
|
195 |
+
with gr.Accordion(open=True, label="Base Image"):
|
196 |
+
base_image_input = gr.Image(type="filepath", sources="upload", height=media_height)
|
197 |
+
gr.Examples(
|
198 |
+
examples=[[example] for example in example_base_images[female_key]],
|
199 |
+
inputs=[base_image_input],
|
200 |
+
cache_examples=False,
|
201 |
+
label="Female",
|
202 |
+
)
|
203 |
+
gr.Examples(
|
204 |
+
examples=[[example] for example in example_base_images[male_key]],
|
205 |
+
inputs=[base_image_input],
|
206 |
+
cache_examples=False,
|
207 |
+
label="Male",
|
208 |
+
)
|
209 |
+
|
210 |
+
# Step 2: Motion and Audio/TTS
|
211 |
+
with gr.Column(scale=4):
|
212 |
+
gr.Markdown("### Step 2: Motion and Audio/TTS")
|
213 |
+
gr.Markdown("Select motion and provide audio or text for lip-sync.")
|
214 |
+
with gr.Accordion(open=True, label="Base Motion"):
|
215 |
+
base_motion_expression = gr.Radio(
|
216 |
+
choices=valid_base_motion_expressions,
|
217 |
+
label="Select base motion",
|
218 |
+
value=get_default_base_motion_expression(),
|
219 |
+
)
|
220 |
+
with gr.Tabs():
|
221 |
+
with gr.TabItem("Driving Audio: File") as tab_audio_file:
|
222 |
+
with gr.Accordion(open=True, label="Driving Audio: From File"):
|
223 |
+
driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
|
224 |
+
gr.Examples(
|
225 |
+
examples=[[example] for example in example_driving_audios[female_key]],
|
226 |
+
inputs=[driving_audio_input],
|
227 |
+
cache_examples=False,
|
228 |
+
examples_per_page=18,
|
229 |
+
label="Female",
|
230 |
+
)
|
231 |
+
gr.Examples(
|
232 |
+
examples=[[example] for example in example_driving_audios[male_key]],
|
233 |
+
inputs=[driving_audio_input],
|
234 |
+
cache_examples=False,
|
235 |
+
examples_per_page=18,
|
236 |
+
label="Male",
|
237 |
+
)
|
238 |
+
|
239 |
+
with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
|
240 |
+
with gr.Accordion(open=True, label="Driving Audio: From Text"):
|
241 |
+
driving_input_voice = gr.Dropdown(
|
242 |
+
choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
|
243 |
+
)
|
244 |
+
driving_text_input = gr.Textbox(
|
245 |
+
label="Input Text (300 characters max)",
|
246 |
+
lines=2,
|
247 |
+
)
|
248 |
+
for group in audio_text_groups:
|
249 |
+
gr.Examples(
|
250 |
+
examples=[[example] for example in example_driving_audio_texts[group]],
|
251 |
+
inputs=[driving_text_input],
|
252 |
+
cache_examples=False,
|
253 |
+
label=group,
|
254 |
+
)
|
255 |
+
|
256 |
+
# Step 3: Result
|
257 |
+
with gr.Column(scale=4):
|
258 |
+
gr.Markdown("### Step 3: Result")
|
259 |
+
gr.Markdown("Generate and view the output video.")
|
260 |
+
process_button_animation = gr.Button("🌟 Generate", variant="primary")
|
261 |
+
output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
|
262 |
+
message = gr.Textbox(label="Info")
|
263 |
+
process_button_reset = gr.ClearButton(
|
264 |
+
[
|
265 |
+
base_image_input,
|
266 |
+
driving_audio_input,
|
267 |
+
driving_text_input,
|
268 |
+
driving_input_voice,
|
269 |
+
output_video_i2v,
|
270 |
+
],
|
271 |
+
value="🧹 Clear",
|
272 |
+
)
|
273 |
+
|
274 |
+
base_image_input.change(fn=update_voices, inputs=[base_image_input], outputs=[driving_input_voice])
|
275 |
+
|
276 |
+
# binding functions for buttons
|
277 |
+
process_button_animation.click(
|
278 |
+
fn=task_executor_fn,
|
279 |
+
inputs=[
|
280 |
+
base_image_input,
|
281 |
+
base_motion_expression,
|
282 |
+
driving_audio_input,
|
283 |
+
driving_text_input,
|
284 |
+
driving_input_voice,
|
285 |
+
],
|
286 |
+
outputs=[output_video_i2v, output_video_i2v, message],
|
287 |
+
show_progress=True,
|
288 |
+
)
|
289 |
+
|
290 |
+
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_video:
|
291 |
+
with gr.Row():
|
292 |
+
# Step 1: Choose Video
|
293 |
+
with gr.Column(scale=4):
|
294 |
+
gr.Markdown("### Step 1: Choose Video")
|
295 |
+
gr.Markdown("Upload or select an example video to drive.")
|
296 |
+
with gr.Accordion(open=True, label="Base Video"):
|
297 |
+
base_video_input = gr.Video(sources="upload", height=media_height, interactive=True)
|
298 |
+
gr.Examples(
|
299 |
+
examples=[[example] for example in example_source_videos[female_key]],
|
300 |
+
inputs=[base_video_input],
|
301 |
+
cache_examples=False,
|
302 |
+
label="Female",
|
303 |
+
)
|
304 |
+
gr.Examples(
|
305 |
+
examples=[[example] for example in example_source_videos[male_key]],
|
306 |
+
inputs=[base_video_input],
|
307 |
+
cache_examples=False,
|
308 |
+
label="Male",
|
309 |
+
)
|
310 |
+
|
311 |
+
# Step 2: Audio/TTS
|
312 |
+
with gr.Column(scale=4):
|
313 |
+
gr.Markdown("### Step 2: Audio/TTS")
|
314 |
+
gr.Markdown("Provide audio or text for lip-sync.")
|
315 |
+
with gr.Tabs():
|
316 |
+
with gr.TabItem("Driving Audio: File") as tab_audio_file:
|
317 |
+
with gr.Accordion(open=True, label="Driving Audio: From File"):
|
318 |
+
driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
|
319 |
+
gr.Examples(
|
320 |
+
examples=[[example] for example in example_driving_audios[female_key]],
|
321 |
+
inputs=[driving_audio_input],
|
322 |
+
cache_examples=False,
|
323 |
+
examples_per_page=18,
|
324 |
+
label="Female",
|
325 |
+
)
|
326 |
+
gr.Examples(
|
327 |
+
examples=[[example] for example in example_driving_audios[male_key]],
|
328 |
+
inputs=[driving_audio_input],
|
329 |
+
cache_examples=False,
|
330 |
+
examples_per_page=18,
|
331 |
+
label="Male",
|
332 |
+
)
|
333 |
+
with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
|
334 |
+
with gr.Accordion(open=True, label="Driving Audio: From Text"):
|
335 |
+
driving_input_voice = gr.Dropdown(
|
336 |
+
choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
|
337 |
+
)
|
338 |
+
driving_text_input = gr.Textbox(
|
339 |
+
label="Input Text (300 characters max)",
|
340 |
+
lines=2,
|
341 |
+
)
|
342 |
+
for group in audio_text_groups:
|
343 |
+
gr.Examples(
|
344 |
+
examples=[[example] for example in example_driving_audio_texts[group]],
|
345 |
+
inputs=[driving_text_input],
|
346 |
+
cache_examples=False,
|
347 |
+
label=group,
|
348 |
+
)
|
349 |
+
# Step 3: Result
|
350 |
+
with gr.Column(scale=4):
|
351 |
+
gr.Markdown("### Step 3: Result")
|
352 |
+
gr.Markdown("Generate and view the output video.")
|
353 |
+
process_button_animation = gr.Button("🌟 Generate", variant="primary")
|
354 |
+
output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
|
355 |
+
message = gr.Textbox(label="Info")
|
356 |
+
process_button_reset = gr.ClearButton(
|
357 |
+
[base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v],
|
358 |
+
value="🧹 Clear",
|
359 |
+
)
|
360 |
+
|
361 |
+
base_video_input.change(fn=update_voices, inputs=[base_video_input], outputs=[driving_input_voice])
|
362 |
+
|
363 |
+
# binding functions for buttons
|
364 |
+
base_motion_expression = gr.Radio(value=None, visible=False)
|
365 |
+
process_button_animation.click(
|
366 |
+
fn=task_executor_fn,
|
367 |
+
inputs=[
|
368 |
+
base_video_input,
|
369 |
+
base_motion_expression,
|
370 |
+
driving_audio_input,
|
371 |
+
driving_text_input,
|
372 |
+
driving_input_voice,
|
373 |
+
],
|
374 |
+
outputs=[output_video_i2v, output_video_i2v, message],
|
375 |
+
show_progress=True,
|
376 |
+
)
|
377 |
+
|
378 |
+
with gr.Blocks() as showcase_examples:
|
379 |
+
gr.Markdown("# Make Image Talk")
|
380 |
+
with gr.Row():
|
381 |
+
with gr.Column(scale=7):
|
382 |
+
for path in examples_showcase["make_image_talk_multilingual"]:
|
383 |
+
gr.Video(value=path, label=os.path.basename(path), height=300)
|
384 |
+
with gr.Column(scale=3):
|
385 |
+
for path in examples_showcase["make_image_talk_cartoon"]:
|
386 |
+
gr.Video(value=path, label=os.path.basename(path), height=616)
|
387 |
+
with gr.Row():
|
388 |
+
with gr.Column(scale=7):
|
389 |
+
for path in examples_showcase["make_image_talk_diff_angles"]:
|
390 |
+
gr.Video(value=path, label=os.path.basename(path), height=350)
|
391 |
+
with gr.Column(scale=3):
|
392 |
+
for path in examples_showcase["make_image_talk_hb"]:
|
393 |
+
gr.Video(value=path, label=os.path.basename(path), height=350)
|
394 |
+
with gr.Row():
|
395 |
+
for path in examples_showcase['make_image_talk_selfie']:
|
396 |
+
gr.Video(value=path, label=os.path.basename(path), height=430)
|
397 |
+
|
398 |
+
gr.Markdown("# Make Video Talk")
|
399 |
+
with gr.Row():
|
400 |
+
with gr.Column(scale=7):
|
401 |
+
for path in examples_showcase["make_video_talk_multilingual"]:
|
402 |
+
gr.Video(value=path, label=os.path.basename(path), height=300)
|
403 |
+
with gr.Column(scale=3):
|
404 |
+
for path in examples_showcase["make_video_talk_corp_msg"]:
|
405 |
+
gr.Video(value=path, label=os.path.basename(path), height=616)
|
406 |
+
with gr.Row():
|
407 |
+
for path in examples_showcase["make_video_talk_rap_multii"]:
|
408 |
+
gr.Video(value=path, label=os.path.basename(path), height=500)
|
409 |
+
|
410 |
+
gr.Markdown("# Dubbing")
|
411 |
+
with gr.Row():
|
412 |
+
for path in examples_showcase["dubbing_superpowerman"]:
|
413 |
+
gr.Video(value=path, label=os.path.basename(path), height=320)
|
414 |
+
with gr.Row():
|
415 |
+
for path in examples_showcase["dubbing_coffee"]:
|
416 |
+
gr.Video(value=path, label=os.path.basename(path), height=440)
|
417 |
+
|
418 |
+
with gr.Blocks(analytics_enabled=False, css="footer{display:none !important}", title="SUTRA Avatar v2") as demo:
|
419 |
+
gr.Markdown(
|
420 |
+
"""
|
421 |
+
## <img src="https://playground.two.ai/sutra.svg" height="20"/>
|
422 |
+
"""
|
423 |
+
)
|
424 |
+
title = "# 🌟 SUTRA Avatar v2 🌟\n## Drive Image or Video with LipSync from Audio or Text"
|
425 |
+
gr.Markdown(title)
|
426 |
+
gr.Markdown(description)
|
427 |
+
|
428 |
+
gr.TabbedInterface(
|
429 |
+
interface_list=[demo_image, demo_video, showcase_examples],
|
430 |
+
tab_names=["Drive Image", "Drive Video", "Showcase Examples"],
|
431 |
+
)
|
432 |
+
|
433 |
+
if __name__ == "__main__":
|
434 |
+
parser = argparse.ArgumentParser(description="SUTRA AVATAR CLIENT")
|
435 |
+
args = parser.parse_args()
|
436 |
+
task_executor = CloudTaskExecutor()
|
437 |
+
|
438 |
+
demo.queue(default_concurrency_limit=10).launch(
|
439 |
+
server_name="0.0.0.0",
|
440 |
+
allowed_paths=["/"],
|
441 |
+
)
|
base_task_executor.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
import re
|
4 |
+
import shutil
|
5 |
+
import time
|
6 |
+
from abc import ABC, abstractmethod
|
7 |
+
from datetime import datetime
|
8 |
+
from pathlib import Path
|
9 |
+
|
10 |
+
import gradio as gr
|
11 |
+
|
12 |
+
from elevenlabs_helper import ElevenLabsHelper
|
13 |
+
|
14 |
+
# ---
|
15 |
+
talk_key = "talk"
|
16 |
+
|
17 |
+
# ---
|
18 |
+
|
19 |
+
valid_image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
|
20 |
+
|
21 |
+
|
22 |
+
def is_image(file_path):
|
23 |
+
return file_path.lower().endswith(valid_image_exts)
|
24 |
+
|
25 |
+
|
26 |
+
def get_formatted_datetime_name() -> str:
|
27 |
+
d = datetime.now()
|
28 |
+
return d.strftime("d%y%m%d" + "-" + "t%H%M%S")
|
29 |
+
|
30 |
+
|
31 |
+
def get_name_ext(filepath):
|
32 |
+
filepath = os.path.abspath(filepath)
|
33 |
+
_, name_ext = os.path.split(filepath)
|
34 |
+
name, ext = os.path.splitext(name_ext)
|
35 |
+
return name, ext
|
36 |
+
|
37 |
+
|
38 |
+
def sanitize_string(string):
|
39 |
+
sanitized_string = re.sub(r"[^A-Za-z0-9]", "", string)
|
40 |
+
max_len = 15
|
41 |
+
return sanitized_string[:max_len]
|
42 |
+
|
43 |
+
|
44 |
+
def get_output_video_name(
|
45 |
+
input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path, tag=""
|
46 |
+
):
|
47 |
+
if not tag:
|
48 |
+
tag = get_formatted_datetime_name()
|
49 |
+
|
50 |
+
base_name, _ = get_name_ext(input_base_path)
|
51 |
+
base_name = sanitize_string(base_name)
|
52 |
+
|
53 |
+
driving_name = ""
|
54 |
+
if input_driving_path:
|
55 |
+
driving_name, _ = get_name_ext(input_driving_path)
|
56 |
+
driving_name = sanitize_string(driving_name)
|
57 |
+
elif base_motion_expression and is_image(input_base_path):
|
58 |
+
driving_name = base_motion_expression
|
59 |
+
|
60 |
+
audio_name = ""
|
61 |
+
if input_driving_audio_path:
|
62 |
+
audio_name, _ = get_name_ext(input_driving_audio_path)
|
63 |
+
audio_name = sanitize_string(audio_name)
|
64 |
+
|
65 |
+
output_video_name = f"{tag}--b-{base_name}"
|
66 |
+
|
67 |
+
if driving_name:
|
68 |
+
output_video_name += f"--d-{driving_name}"
|
69 |
+
|
70 |
+
if audio_name:
|
71 |
+
output_video_name += f"--a-{audio_name}"
|
72 |
+
return output_video_name
|
73 |
+
|
74 |
+
|
75 |
+
def generate_random_integer(num_digits):
|
76 |
+
current_time = int(time.time() * 1000)
|
77 |
+
random.seed(current_time)
|
78 |
+
lower_bound = 0
|
79 |
+
upper_bound = (10**num_digits) - 1
|
80 |
+
return random.randint(lower_bound, upper_bound)
|
81 |
+
|
82 |
+
|
83 |
+
def get_unique_name(maxd=4, delim="-"):
|
84 |
+
pid = os.getpid()
|
85 |
+
pid_str = str(pid)[-maxd:]
|
86 |
+
|
87 |
+
time_ns = time.time_ns()
|
88 |
+
time_str = str(time_ns)[-maxd:]
|
89 |
+
|
90 |
+
rint = generate_random_integer(maxd)
|
91 |
+
rint_str = str(rint).zfill(maxd)
|
92 |
+
return delim.join([pid_str, time_str, rint_str])
|
93 |
+
|
94 |
+
|
95 |
+
def mkdir_p(path: str) -> None:
|
96 |
+
if not Path(path).exists():
|
97 |
+
Path(path).mkdir(parents=True)
|
98 |
+
|
99 |
+
|
100 |
+
# ---
|
101 |
+
|
102 |
+
|
103 |
+
class BaseTaskExecutor(ABC):
    """Orchestrates one avatar-generation request.

    Validates the driving inputs, optionally synthesizes TTS audio via
    ElevenLabs, delegates rendering to the concrete `generate`
    implementation, and removes the per-request scratch directory.
    """

    def __init__(self):
        # Per-request scratch space; each task gets a unique subdirectory.
        self.tmp_dir = "/tmp/gradio"

    def execute_task(
        self, input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
    ):
        """Run a single generation task and return Gradio-ready outputs.

        Returns:
            (video_path, gr.update, messages); on failure video_path is
            None and *messages* carries the error text.

        Raises:
            gr.Error: if no base image/video was provided.
        """
        tag = get_unique_name()
        output_dir = os.path.join(self.tmp_dir, tag)
        mkdir_p(output_dir)

        # Audio can come from an uploaded file or from TTS (text + voice).
        do_dafile = input_driving_audio_path is not None and os.path.exists(input_driving_audio_path)
        do_datts = driving_text_input and driving_voice_input
        do_talk = do_dafile or do_datts

        if base_motion_expression:
            # Expressive (non-talking) base motions are mutually exclusive
            # with driving audio; audio wins only for talking motions.
            if talk_key not in base_motion_expression and do_talk:
                gr.Warning(
                    f"Ignoring Driving Audio since expressive Base Motion selected: {base_motion_expression}")
                do_dafile = False
                do_datts = False
                do_talk = False

            if talk_key in base_motion_expression and not do_talk:
                gr.Warning("Selected talking Base Motion but no Driving Audio")
        else:
            base_motion_expression = ""

        if do_datts:
            if do_dafile:
                gr.Warning("Ignoring Audio File input since TTS is selected.\nClear the undesired input if this is not intended.")
            # BUGFIX: os.path.join() was previously called with a single
            # pre-joined f-string argument (a no-op join); join the
            # directory and filename as separate components.
            output_audio_file = os.path.join(output_dir, f"{tag}.mp3")
            ElevenLabsHelper.generate_voice(driving_text_input, driving_voice_input, output_audio_file)
            input_driving_audio_path = output_audio_file

        if not do_talk:
            input_driving_audio_path = ""

        if input_base_path is not None and os.path.exists(input_base_path):
            input_driving_path = ""  # driving video is unused in this flow
            request_id = get_unique_name(maxd=8, delim="")
            output_video_path = os.path.join(
                self.tmp_dir,
                get_output_video_name(
                    input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path
                )
                + ".mp4",
            )
            result, output_video_path = self.generate(
                input_base_path,
                input_driving_path,
                base_motion_expression,
                input_driving_audio_path,
                output_video_path,
                request_id,
            )
            success = result["success"]
            messages = result["messages"]

            self.clean(output_dir)

            if success:
                return output_video_path, gr.update(visible=True), messages
            else:
                gr.Info("Task could not be completed", duration=4)
                return None, gr.update(visible=False), f"ERROR\n\n{messages}"
        else:
            self.clean(output_dir)
            raise gr.Error("No source selected!", duration=6)

    @abstractmethod
    def generate(
        self,
        input_base_path,
        input_driving_path,
        base_motion_expression,
        input_driving_audio_path,
        output_video_path,
        request_id,
    ):
        """Render the avatar video; return (result dict, output video path).

        FIX: the abstract signature previously declared only ``self``,
        which did not match the concrete implementations or the call in
        ``execute_task``.
        """

    def clean(self, output_dir):
        """Remove the per-request scratch directory if it exists."""
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
|
cloud_task_executor.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import json
|
3 |
+
import ntpath
|
4 |
+
import os
|
5 |
+
import time
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import requests
|
9 |
+
from google.cloud import storage
|
10 |
+
|
11 |
+
from base_task_executor import BaseTaskExecutor
|
12 |
+
|
13 |
+
# ---
|
14 |
+
enc = "utf-8"  # default text encoding for base64 payloads
|
15 |
+
|
16 |
+
|
17 |
+
def decode(string):
    """Decode a base64-encoded JSON document into a Python object."""
    raw = base64.b64decode(string)
    return json.loads(raw)
|
19 |
+
|
20 |
+
|
21 |
+
def get_storage_client_from_env():
    """Build a GCS client from the base64-encoded service-account JSON
    stored in the ``GCP_API_KEY`` environment variable."""
    service_account_info = decode(os.environ["GCP_API_KEY"])
    return storage.Client.from_service_account_info(service_account_info)
|
24 |
+
|
25 |
+
|
26 |
+
def get_name_ext(filepath):
    """Return (stem, extension) of *filepath*'s final path component,
    after absolutizing the path."""
    tail = os.path.basename(os.path.abspath(filepath))
    return os.path.splitext(tail)
|
31 |
+
|
32 |
+
|
33 |
+
def make_remote_media_path(request_id, media_path):
    """Build the remote object path ``<src>/<slot>/<suffix>/<filename>``
    by partitioning a request id (>6 chars) as 3 + 3 + remainder.

    Raises:
        ValueError: if *request_id* is too short to partition.
        FileNotFoundError: if *media_path* does not exist.

    BUGFIX: input validation previously used ``assert``, which is stripped
    under ``python -O``; explicit exceptions are always enforced.
    """
    if len(request_id) <= 6:
        raise ValueError(f"request_id too short (need >6 chars): {request_id!r}")
    if not os.path.exists(media_path):
        raise FileNotFoundError(media_path)
    src_id = request_id[:3]
    slot_id = request_id[3:6]
    request_suffix = request_id[6:]
    # stem + extension of the absolutized path == its basename
    filename = os.path.basename(os.path.abspath(media_path))
    return os.path.join(src_id, slot_id, request_suffix, filename)
|
41 |
+
|
42 |
+
|
43 |
+
def copy_file_to_gcloud(bucket, local_file_path, remote_file_path):
    """Upload a local file to *remote_file_path* inside the given GCS bucket."""
    bucket.blob(remote_file_path).upload_from_filename(local_file_path)
|
46 |
+
|
47 |
+
def copy_to_gcloud(storage_client, local_media_path, bucket_name, remote_media_path):
    """Resolve *bucket_name* via the client and upload the local file to it."""
    target_bucket = storage_client.get_bucket(bucket_name)
    copy_file_to_gcloud(target_bucket, local_media_path, remote_media_path)
|
50 |
+
|
51 |
+
|
52 |
+
# ---
|
53 |
+
|
54 |
+
|
55 |
+
class CloudTaskExecutor(BaseTaskExecutor):
    """Task executor that uploads inputs to GCS and drives a remote
    rendering service over HTTP, polling until the task completes.

    Configuration comes from env vars SUTRA_AVATAR_BASE_URL,
    SUTRA_AVATAR_API_KEY, SUTRA_AVATAR_BUCKET_NAME, and GCP_API_KEY.
    """

    def __init__(self):
        super().__init__()
        self.base_url = os.getenv("SUTRA_AVATAR_BASE_URL")
        self.headers = {"Authorization": f'{os.getenv("SUTRA_AVATAR_API_KEY")}', "Content-Type": "application/json"}
        self.bucket_name = os.getenv("SUTRA_AVATAR_BUCKET_NAME")
        self.storage_client = get_storage_client_from_env()

    def submit_task(self, submit_request):
        """POST the task request and return the service's JSON reply.

        Raises:
            requests.HTTPError: for 4xx/5xx responses.

        BUGFIX: the old ``== 200`` check silently returned None for other
        2xx codes; raise_for_status covers every error code in one call.
        """
        url = f"{self.base_url}/task/submit"
        response = requests.post(url, json=submit_request, headers=self.headers)
        response.raise_for_status()
        return response.json()

    def get_task_status(self, request_id):
        """GET the current status of *request_id* and return the JSON reply.

        Raises:
            requests.HTTPError: for 4xx/5xx responses.
        """
        url = f"{self.base_url}/task/status"
        response = requests.get(url, params={"rid": request_id}, headers=self.headers)
        response.raise_for_status()
        return response.json()

    def generate(
        self,
        input_base_path,
        input_driving_path,
        base_motion_expression,
        input_driving_audio_path,
        output_video_path,
        request_id,
    ):
        """Upload media, submit the remote task, and poll to completion.

        Returns:
            (result, output_video_path): *result* carries "success" and
            "messages"; on success *output_video_path* is replaced by the
            remote video URL from the status reply.
        """
        # Upload the base image/video and the (optional) audio to GCS.
        for media_path in (input_base_path, input_driving_audio_path):
            if media_path:
                remote_media_path = make_remote_media_path(request_id, media_path)
                copy_to_gcloud(self.storage_client, media_path, self.bucket_name, remote_media_path)

        # The service resolves bare filenames against the uploaded objects.
        submit_request = {
            "requestId": request_id,
            "input_base_path": ntpath.basename(input_base_path),
            "input_driving_path": "",
            "base_motion_expression": base_motion_expression,
            "input_driving_audio_path": ntpath.basename(input_driving_audio_path),
            "output_video_path": ntpath.basename(output_video_path),
        }
        submit_reply = self.submit_task(submit_request)
        estimated_wait_seconds = submit_reply.get("estimatedWaitSeconds", "unknown")

        completion_statuses = {"Succeeded", "Cancelled", "Failed", "NotFound"}
        timeout = 240  # maximum time to wait in seconds
        if isinstance(estimated_wait_seconds, int):
            timeout += estimated_wait_seconds
        start_time = time.time()

        result = {"success": False, "messages": ""}
        while True:
            status_reply = self.get_task_status(request_id)
            if status_reply["taskStatus"] in completion_statuses:
                break
            if time.time() - start_time > timeout:
                msg = "The task did not complete within the timeout period.\n The server is very busy serving other requests.\n Please try again."
                result["messages"] = msg
                # BUGFIX: gr.Error was instantiated but never raised — a
                # no-op; gr.Warning surfaces the message without aborting.
                gr.Warning(msg)
                break
            time.sleep(3)  # poll interval

        if status_reply["taskStatus"] == "Succeeded":
            pipe_reply = status_reply["pipeReply"]
            result["success"] = pipe_reply["status"] == "success"
            result["messages"] = pipe_reply["messages"]
            output_video_path = status_reply["videoURL"]  # remote URL
        else:
            # Timed out, cancelled, failed, or unknown: append any
            # pipeline messages after the (possible) timeout message.
            if "pipeReply" in status_reply:
                result["messages"] += status_reply["pipeReply"]["messages"]
        return result, output_video_path
|
data/input_audio/gradio/female/en-BeesWingsBeat-Shelby.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2a85a13e25fb823143e26a39ce6de823861199b90784db4461a243d01f87201
|
3 |
+
size 55588
|
data/input_audio/gradio/female/en-EnhanceEfficiency-Shelby.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35135724f58a72574c9f92e5bcdec1c41eac7f02480fc306b648263f0750a742
|
3 |
+
size 60604
|
data/input_audio/gradio/female/en-The2026WorldCup-Shelby.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c4054806558c0f2b26313a5b352b042fdc7dba0c90eac36e9c0c667dd00bcf3
|
3 |
+
size 71053
|
data/input_audio/gradio/female/hi-BeesWingsBeat-Matilda.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:981852faccc81eccf82effc8ad3a2bef134c447c038ec15c4c7ff418c1a40c25
|
3 |
+
size 57678
|
data/input_audio/gradio/female/hi-EnhanceEfficiency-Matilda.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:568d0dd0fad0648e711fa50e0c048cad18df52e03a87503ff382379686acf89b
|
3 |
+
size 48065
|
data/input_audio/gradio/female/hi-The2026WorldCup-Matilda.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a304d592f4d2b10a91f7b82b25416813ca891b50e64fb513aa7f3cf1b8f0cd7c
|
3 |
+
size 53498
|
data/input_audio/gradio/female/ko-BeesWingsBeat-Jinju.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13e97b106f1757b8f64cecb33ae9265eaba0dfa5a28bb6f27d1f42534937f203
|
3 |
+
size 47229
|
data/input_audio/gradio/female/ko-EnhanceEfficiency-Jinju.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34144828e9499c22fa5d7be6a621aadea5f0a25d68dca04a6ad3b65f01dfa36d
|
3 |
+
size 48065
|
data/input_audio/gradio/female/ko-The2026WorldCup-Jinju.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:60f6dc9a567be17f2edc9d4fa5e877a4025e7acabdc4260014612b420f7b2981
|
3 |
+
size 57678
|
data/input_audio/gradio/male/en-BeesWingsBeat-Marcus.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b677ad256f0d28d1c9c9afabb347d7b1520aadd1b0e19ca09665fe3b9a7adfed
|
3 |
+
size 46811
|
data/input_audio/gradio/male/en-EnhanceEfficiency-Marcus.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:770cce3bbfca0913ceb8651584d6515c8f271bffb45d11e0f76ecf96af19e00a
|
3 |
+
size 40542
|
data/input_audio/gradio/male/en-The2026WorldCup-Marcus.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86f34c9f42944b8a76cc727c06f28556630d94a82304b37890919cb64d8cab51
|
3 |
+
size 57260
|
data/input_audio/gradio/male/hi-BeesWingsBeat-Liam.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f767e72ca739f8e3ba3edea24f5f9b533bfdbef37c60db02125dd1c18d54a1ef
|
3 |
+
size 64365
|
data/input_audio/gradio/male/hi-EnhanceEfficiency-Liam.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fcdb3e0776e8aa60778d97dc9a73beaa81b6b94a2b31cf4e34437fdc12233425
|
3 |
+
size 50991
|
data/input_audio/gradio/male/hi-The2026WorldCup-Liam.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f292ae2e165b6fb713807888ab604848bf02f162f1621d47cd06bfc1926dd7f
|
3 |
+
size 54752
|
data/input_audio/gradio/male/ko-BeesWingsBeat-Noah.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4bdcbdf30de7b6fbadd04099c08e47812311aeb1fcc5bb2c87ac4d92ab5d9a90
|
3 |
+
size 47229
|
data/input_audio/gradio/male/ko-EnhanceEfficiency-Noah.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be33748c1b19c74abb6f2daaa343d4c5c2c5c8c00a7a03d2fbc20ca8e08ef9a6
|
3 |
+
size 44303
|
data/input_audio/gradio/male/ko-The2026WorldCup-Noah.mp3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a4ea9b5d46d6e419b59b875b0b84203170736a394e4eb676e7da70af8261d64
|
3 |
+
size 58514
|
data/input_image_bases/female/01-Female-American_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/02-Female-Indian01_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/03-Female-Korean_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/04-Female-Indian02_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/female/05-Female-European_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/01-Male-Indian_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/02-Male-Korean_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/03-Male-European_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/04-Male-American_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_image_bases/male/05-Male-AfricanAmerican_608.jpg
ADDED
![]() |
Git LFS Details
|
data/input_video_bases/female/01-Female-Korean_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ecf7828e7d0f421767d190b3555868728b184edac1f4a0201820f1c58865d7c
|
3 |
+
size 2000776
|
data/input_video_bases/female/02-Female-Latina_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6884cd58b987f02443d83b3faae37951aa33a689245c3bf65725f609c6303789
|
3 |
+
size 2666194
|
data/input_video_bases/female/03-Female-European_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbde154264db6fbcb94e3c93c529b365f67e667473cc8a1445e0e9223ce6ea8b
|
3 |
+
size 1625368
|
data/input_video_bases/female/04-Female-Indian_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a3a358644c023f7cde032e5570d9b39b615b594d8ab6747456a2c60ac9a1f1c
|
3 |
+
size 1529791
|
data/input_video_bases/female/05-Female-American_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:35a91366a511a6b27f15edca2b5b6428e1ea3781971c9ac4202a34c49c0cef89
|
3 |
+
size 1903512
|
data/input_video_bases/male/01-Male-Japanese_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9325107bacf0442932b74f88fc861a008fbbf4770f32074a0f818cc7f69c1759
|
3 |
+
size 1770959
|
data/input_video_bases/male/02-Male-European_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0eb1e61a0b6f22a4fcfd3acb90c5e661396678fcde7eca3edd394f1223483ea
|
3 |
+
size 1693659
|
data/input_video_bases/male/03-Male-American02_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68c9427293f6b721ac180f596b71ea4df1e5a5f5d3938f7ac9ac16df2007562f
|
3 |
+
size 1927639
|
data/input_video_bases/male/04-Male-Indian_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e12f8f6c70d602ad8c8f422ffd703a6c012b453d9902245b82b4ae0c051397d6
|
3 |
+
size 1352685
|
data/input_video_bases/male/05-Male-American_608.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:448f993473f7a8291f8591856e15701c7e9bb373ddbf9e9c8a773d69b84601ac
|
3 |
+
size 1854230
|
data/showcase_examples/archive/01 Multilingual Female_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c67441bab5596482bfcb40c725c0829fb7b4df1a5642e43661b6553b20cefed2
|
3 |
+
size 17771532
|
data/showcase_examples/archive/02 Multilingual Male_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82605475898eddb08165ec3429bb933e94a765d23c8c7a4ef1ecfa70363a4638
|
3 |
+
size 13215459
|
data/showcase_examples/archive/02 Multilingual Male_720_IM.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d63481b053c30f05600791361914e9d2f7a17d003da56d1776f319622d8ec0a3
|
3 |
+
size 17479793
|
data/showcase_examples/archive/03 Corporate Message_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:917db888f55ada94ee47b2f05a0ed2274f71d750b25f3c11ae5e9bc4b86a663c
|
3 |
+
size 2930433
|
data/showcase_examples/archive/04 Multi-Identities: Multilingual_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:414fd98b0816cbd2834353b87dcb3e3f41e3c47423c0b50040a79461c225f500
|
3 |
+
size 5313472
|
data/showcase_examples/archive/05 Multi-Identities: Rap_720.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:efcc23f689bc5067a30ab46efaa6d546c46cf422427dbb058fde6b8be066fbd3
|
3 |
+
size 2556681
|