Spaces:

jhj0517
/

AdvancedLivePortrait-WebUI

Running

App Files Files Community

jhj0517 committed on Nov 10, 2024

Commit

52ea725

unverified ·

2 Parent(s): f78e590 34efb13

Merge pull request #10 from jhj0517/feature/add-video-source

Browse files

Files changed (13) hide show

.github/workflows/ci.yml +4 -1
.gitignore +2 -0
README.md +1 -1
app.py +77 -39
i18n/translation.yaml +80 -0
modules/live_portrait/live_portrait_inferencer.py +169 -197
modules/utils/constants.py +7 -1
modules/utils/image_helper.py +1 -0
modules/utils/paths.py +9 -2
modules/utils/video_helper.py +315 -0
requirements.txt +7 -1
tests/test_config.py +63 -2
tests/test_video_creation.py +39 -0

.github/workflows/ci.yml CHANGED Viewed

@@ -28,8 +28,11 @@ jobs:
         with:
           python-version: ${{ matrix.python }}
       - name: Install dependencies
-        run: pip install -r requirements.txt pytest
       - name: Run test
         run: python -m pytest -rs tests

         with:
           python-version: ${{ matrix.python }}
+      - name: Install ffmpeg
+        run: sudo apt-get update && sudo apt-get install -y ffmpeg
       - name: Install dependencies
+        run: pip install -r requirements.txt pytest scikit-image moviepy
       - name: Run test
         run: python -m pytest -rs tests

.gitignore CHANGED Viewed

@@ -4,5 +4,7 @@ models/
 outputs/
 *.png
 *.jpg
 **/.pytest_cache

 outputs/
 *.png
 *.jpg
+*.jpeg
+**/__pycache__
 **/.pytest_cache

README.md CHANGED Viewed

@@ -14,7 +14,7 @@ You can try it in Colab
 ### Prerequisite
 1. `3.9` <= `python` <= `3.12` : https://www.python.org/downloads/release/python-3110/
 2. **(Optional, only if you're using Nvidia GPU)** CUDA 12.4 : https://developer.nvidia.com/cuda-12-4-0-download-archive?target_os=Windows
 ## Run Locally
 1. git clone this repository
 ```

 ### Prerequisite
 1. `3.9` <= `python` <= `3.12` : https://www.python.org/downloads/release/python-3110/
 2. **(Optional, only if you're using Nvidia GPU)** CUDA 12.4 : https://developer.nvidia.com/cuda-12-4-0-download-archive?target_os=Windows
+3. (Optional, only needed if you use the Video Driven feature) `FFmpeg`: https://ffmpeg.org/download.html <br> After installing `FFmpeg`, make sure to add the FFmpeg/bin folder to your **system PATH**!
 ## Run Locally
 1. git clone this repository
 ```

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ class App:
         )
     @staticmethod
-    def create_parameters():
         return [
             gr.Dropdown(label=_("Model Type"), visible=False, interactive=False,
                         choices=[item.value for item in ModelType], value=ModelType.HUMAN.value),
@@ -38,10 +38,21 @@ class App:
             gr.Slider(label=_("WOO"), minimum=-20, maximum=20, step=0.2, value=0),
             gr.Slider(label=_("Smile"), minimum=-2.0, maximum=2.0, step=0.01, value=0),
             gr.Slider(label=_("Source Ratio"), minimum=0, maximum=1, step=0.01, value=1),
-            gr.Slider(label=_("Sample Ratio"), minimum=-0.2, maximum=1.2, step=0.01, value=1),
-            gr.Dropdown(label=_("Sample Parts"),
                         choices=[part.value for part in SamplePart], value=SamplePart.ALL.value),
-            gr.Slider(label=_("Crop Factor"), minimum=1.5, maximum=2.5, step=0.1, value=1.7)
         ]
     def launch(self):
@@ -49,41 +60,68 @@ class App:
             with self.i18n:
                 gr.Markdown(REPO_MARKDOWN, elem_id="md_project")
-                with gr.Row():
-                    with gr.Column():
-                        img_ref = gr.Image(label=_("Reference Image"))
-                with gr.Row():
-                    btn_gen = gr.Button("GENERATE", visible=False)
-                with gr.Row(equal_height=True):
-                    with gr.Column(scale=9):
-                        img_out = gr.Image(label=_("Output Image"))
-                    with gr.Column(scale=1):
-                        expression_parameters = self.create_parameters()
-                        btn_openfolder = gr.Button('📂')
-                        with gr.Accordion("Opt in features", visible=False):
-                            img_sample = gr.Image()
-                            img_motion_link = gr.Image()
-                            tb_exp = gr.Textbox()
-                params = expression_parameters + [img_ref]
-                opt_in_features_params = [img_sample, img_motion_link, tb_exp]
-                gr.on(
-                    triggers=[param.change for param in params],
-                    fn=self.inferencer.edit_expression,
-                    inputs=params + opt_in_features_params,
-                    outputs=img_out,
-                    show_progress="minimal",
-                    queue=True
-                )
-                btn_openfolder.click(
-                    fn=lambda: self.open_folder(self.args.output_dir), inputs=None, outputs=None
-                )
-                btn_gen.click(self.inferencer.edit_expression,
-                              inputs=params + opt_in_features_params,
-                              outputs=img_out)
             gradio_launch_args = {
                 "inbrowser": self.args.inbrowser,

         )
     @staticmethod
+    def create_expression_parameters():
         return [
             gr.Dropdown(label=_("Model Type"), visible=False, interactive=False,
                         choices=[item.value for item in ModelType], value=ModelType.HUMAN.value),
             gr.Slider(label=_("WOO"), minimum=-20, maximum=20, step=0.2, value=0),
             gr.Slider(label=_("Smile"), minimum=-2.0, maximum=2.0, step=0.01, value=0),
             gr.Slider(label=_("Source Ratio"), minimum=0, maximum=1, step=0.01, value=1),
+            gr.Slider(label=_("Sample Ratio"), minimum=-0.2, maximum=1.2, step=0.01, value=1, visible=False),
+            gr.Dropdown(label=_("Sample Parts"), visible=False,
                         choices=[part.value for part in SamplePart], value=SamplePart.ALL.value),
+            gr.Slider(label=_("Face Crop Factor"), minimum=1.5, maximum=2.5, step=0.1, value=2)
+        ]
+    @staticmethod
+    def create_video_parameters():
+        return [
+            gr.Dropdown(label=_("Model Type"), visible=False, interactive=False,
+                        choices=[item.value for item in ModelType],
+                        value=ModelType.HUMAN.value),
+            gr.Slider(label=_("First frame eyes alignment factor"), minimum=0, maximum=1, step=0.01, value=1),
+            gr.Slider(label=_("First frame mouth alignment factor"), minimum=0, maximum=1, step=0.01, value=1),
+            gr.Slider(label=_("Face Crop Factor"), minimum=1.5, maximum=2.5, step=0.1, value=2),
         ]
     def launch(self):
             with self.i18n:
                 gr.Markdown(REPO_MARKDOWN, elem_id="md_project")
+                with gr.Tabs():
+                    with gr.TabItem(_("Expression Editor")):
+                        with gr.Row():
+                            with gr.Column():
+                                img_ref = gr.Image(label=_("Reference Image"))
+                        with gr.Row():
+                            btn_gen = gr.Button("GENERATE", visible=False)
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=9):
+                                img_out = gr.Image(label=_("Output Image"))
+                            with gr.Column(scale=1):
+                                expression_parameters = self.create_expression_parameters()
+                                btn_openfolder = gr.Button('📂')
+                                with gr.Accordion("Opt in features", visible=False):
+                                    img_sample = gr.Image()
+                        params = expression_parameters + [img_ref]
+                        opt_in_features_params = [img_sample]
+                        gr.on(
+                            triggers=[param.change for param in params],
+                            fn=self.inferencer.edit_expression,
+                            inputs=params + opt_in_features_params,
+                            outputs=img_out,
+                            show_progress="minimal",
+                            queue=True
+                        )
+                        btn_openfolder.click(
+                            fn=lambda: self.open_folder(self.args.output_dir), inputs=None, outputs=None
+                        )
+                        btn_gen.click(self.inferencer.edit_expression,
+                                      inputs=params + opt_in_features_params,
+                                      outputs=img_out)
+                    with gr.TabItem(_("Video Driven")):
+                        with gr.Row():
+                            img_ref = gr.Image(label=_("Reference Image"))
+                            vid_driven = gr.Video(label=_("Expression Video"))
+                            with gr.Column():
+                                vid_params = self.create_video_parameters()
+                        with gr.Row():
+                            btn_gen = gr.Button(_("GENERATE"), variant="primary")
+                        with gr.Row(equal_height=True):
+                            with gr.Column(scale=9):
+                                vid_out = gr.Video(label=_("Output Video"), scale=9)
+                            with gr.Column(scale=1):
+                                btn_openfolder = gr.Button('📂')
+                        params = vid_params + [img_ref, vid_driven]
+                        btn_gen.click(
+                            fn=self.inferencer.create_video,
+                            inputs=params,
+                            outputs=vid_out
+                        )
+                        btn_openfolder.click(
+                            fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "videos")),
+                            inputs=None, outputs=None
+                        )
             gradio_launch_args = {
                 "inbrowser": self.args.inbrowser,

i18n/translation.yaml CHANGED Viewed

@@ -24,6 +24,14 @@ en: # English
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 ko: # Korean
   Language: 언어
@@ -51,6 +59,14 @@ ko: # Korean
   OnlyEyes: 눈만
   All: 전부
   Value above 5 may appear distorted: 5 이상은 왜곡돼 보일 수 있습니다.
 ja: # Japanese
   Language: 言語
@@ -78,6 +94,14 @@ ja: # Japanese
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 es: # Spanish
   Language: Idioma
@@ -105,6 +129,14 @@ es: # Spanish
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 fr: # French
   Language: Langue
@@ -132,6 +164,14 @@ fr: # French
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 de: # German
   Language: Sprache
@@ -159,6 +199,14 @@ de: # German
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 zh: # Chinese
   Language: 语言
@@ -186,6 +234,14 @@ zh: # Chinese
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 uk: # Ukrainian
   Language: Мова
@@ -213,6 +269,14 @@ uk: # Ukrainian
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 ru: # Russian
   Language: Язык
@@ -240,6 +304,14 @@ ru: # Russian
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
 tr: # Turkish
   Language: Dil
@@ -267,3 +339,11 @@ tr: # Turkish
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted

   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 ko: # Korean
   Language: 언어
   OnlyEyes: 눈만
   All: 전부
   Value above 5 may appear distorted: 5 이상은 왜곡돼 보일 수 있습니다.
+  Expression Editor: 표정 편집기
+  Video Driven: 영상 변환
+  Expression Video: 표정 영상
+  GENERATE: 생성
+  Output Video: 결과 영상
+  First frame mouth alignment factor: 첫 프레임 입 반영 비율
+  First frame eyes alignment factor: 첫 프레임 눈 반영 비율
+  Face Crop Factor: 얼굴 크롭 비율
 ja: # Japanese
   Language: 言語
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 es: # Spanish
   Language: Idioma
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 fr: # French
   Language: Langue
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 de: # German
   Language: Sprache
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 zh: # Chinese
   Language: 语言
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 uk: # Ukrainian
   Language: Мова
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 ru: # Russian
   Language: Язык
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor
 tr: # Turkish
   Language: Dil
   OnlyEyes: OnlyEyes
   All: All
   Value above 5 may appear distorted: Value above 5 may appear distorted
+  Expression Editor: Expression Editor
+  Video Driven: Video Driven
+  Expression Video: Expression Video
+  GENERATE: GENERATE
+  Output Video: Output Video
+  First frame mouth alignment factor: First frame mouth alignment factor
+  First frame eyes alignment factor: First frame eyes alignment factor
+  Face Crop Factor: Face Crop Factor

modules/live_portrait/live_portrait_inferencer.py CHANGED Viewed

@@ -4,16 +4,18 @@ import cv2
 import time
 import copy
 import dill
 from ultralytics import YOLO
 import safetensors.torch
 import gradio as gr
 from gradio_i18n import Translate, gettext as _
 from ultralytics.utils import LOGGER as ultralytics_logger
 from enum import Enum
-from typing import Union
 from modules.utils.paths import *
 from modules.utils.image_helper import *
 from modules.live_portrait.model_downloader import *
 from modules.live_portrait.live_portrait_wrapper import LivePortraitWrapper
 from modules.utils.camera import get_rotation_matrix
@@ -32,8 +34,17 @@ class LivePortraitInferencer:
                  model_dir: str = MODELS_DIR,
                  output_dir: str = OUTPUTS_DIR):
         self.model_dir = model_dir
-        os.makedirs(os.path.join(self.model_dir, "animal"), exist_ok=True)
         self.output_dir = output_dir
         self.model_config = load_yaml(MODEL_CONFIG)["model_params"]
         self.appearance_feature_extractor = None
@@ -134,26 +145,24 @@ class LivePortraitInferencer:
     def edit_expression(self,
                         model_type: str = ModelType.HUMAN.value,
-                        rotate_pitch=0,
-                        rotate_yaw=0,
-                        rotate_roll=0,
-                        blink=0,
-                        eyebrow=0,
-                        wink=0,
-                        pupil_x=0,
-                        pupil_y=0,
-                        aaa=0,
-                        eee=0,
-                        woo=0,
-                        smile=0,
-                        src_ratio=1,
-                        sample_ratio=1,
-                        sample_parts="All",
-                        crop_factor=1.5,
-                        src_image=None,
-                        sample_image=None,
-                        motion_link=None,
-                        add_exp=None):
         if isinstance(model_type, ModelType):
             model_type = model_type.value
         if model_type not in [mode.value for mode in ModelType]:
@@ -165,199 +174,158 @@ class LivePortraitInferencer:
             )
         try:
-            rotate_yaw = -rotate_yaw
-            new_editor_link = None
-            if isinstance(motion_link, np.ndarray) and motion_link:
-                self.psi = motion_link[0]
-                new_editor_link = motion_link.copy()
-            elif src_image is not None:
-                if id(src_image) != id(self.src_image) or self.crop_factor != crop_factor:
-                    self.crop_factor = crop_factor
-                    self.psi = self.prepare_source(src_image, crop_factor)
-                    self.src_image = src_image
-                new_editor_link = []
-                new_editor_link.append(self.psi)
-            else:
-                return None
-            psi = self.psi
-            s_info = psi.x_s_info
-            #delta_new = copy.deepcopy()
-            s_exp = s_info['exp'] * src_ratio
-            s_exp[0, 5] = s_info['exp'][0, 5]
-            s_exp += s_info['kp']
-            es = ExpressionSet()
-            if isinstance(sample_image, np.ndarray) and sample_image:
-                if id(self.sample_image) != id(sample_image):
-                    self.sample_image = sample_image
-                    d_image_np = (sample_image * 255).byte().numpy()
-                    d_face = self.crop_face(d_image_np[0], 1.7)
-                    i_d = self.prepare_src_image(d_face)
-                    self.d_info = self.pipeline.get_kp_info(i_d)
-                    self.d_info['exp'][0, 5, 0] = 0
-                    self.d_info['exp'][0, 5, 1] = 0
-                # "OnlyExpression", "OnlyRotation", "OnlyMouth", "OnlyEyes", "All"
-                if sample_parts == SamplePart.ONLY_EXPRESSION.value or sample_parts == SamplePart.ONLY_EXPRESSION.ALL.value:
-                    es.e += self.d_info['exp'] * sample_ratio
-                if sample_parts == SamplePart.ONLY_ROTATION.value or sample_parts == SamplePart.ONLY_ROTATION.ALL.value:
-                    rotate_pitch += self.d_info['pitch'] * sample_ratio
-                    rotate_yaw += self.d_info['yaw'] * sample_ratio
-                    rotate_roll += self.d_info['roll'] * sample_ratio
-                elif sample_parts == SamplePart.ONLY_MOUTH.value:
-                    self.retargeting(es.e, self.d_info['exp'], sample_ratio, (14, 17, 19, 20))
-                elif sample_parts == SamplePart.ONLY_EYES.value:
-                    self.retargeting(es.e, self.d_info['exp'], sample_ratio, (1, 2, 11, 13, 15, 16))
-            es.r = self.calc_fe(es.e, blink, eyebrow, wink, pupil_x, pupil_y, aaa, eee, woo, smile,
-                                rotate_pitch, rotate_yaw, rotate_roll)
-            if isinstance(add_exp, ExpressionSet):
-                es.add(add_exp)
-            new_rotate = get_rotation_matrix(s_info['pitch'] + es.r[0], s_info['yaw'] + es.r[1],
-                                             s_info['roll'] + es.r[2])
-            x_d_new = (s_info['scale'] * (1 + es.s)) * ((s_exp + es.e) @ new_rotate) + s_info['t']
-            x_d_new = self.pipeline.stitching(psi.x_s_user, x_d_new)
-            crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, x_d_new)
-            crop_out = self.pipeline.parse_output(crop_out['out'])[0]
-            crop_with_fullsize = cv2.warpAffine(crop_out, psi.crop_trans_m, get_rgb_size(psi.src_rgb), cv2.INTER_LINEAR)
-            out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(np.uint8)
-            temp_out_img_path, out_img_path = get_auto_incremental_file_path(TEMP_DIR, "png"), get_auto_incremental_file_path(OUTPUTS_DIR, "png")
-            save_image(numpy_array=crop_out, output_path=temp_out_img_path)
-            save_image(numpy_array=out, output_path=out_img_path)
-            new_editor_link.append(es)
-            return out
         except Exception as e:
             raise
     def create_video(self,
-                     retargeting_eyes,
-                     retargeting_mouth,
-                     turn_on,
-                     tracking_src_vid,
-                     animate_without_vid,
-                     command,
-                     crop_factor,
-                     src_images=None,
-                     driving_images=None,
-                     motion_link=None,
-                     progress=gr.Progress()):
-        if not turn_on:
-            return None, None
-        src_length = 1
-        if src_images is None:
-            if motion_link is not None:
-                self.psi_list = [motion_link[0]]
-            else:
-                return None, None
-        if src_images is not None:
-            src_length = len(src_images)
-            if id(src_images) != id(self.src_images) or self.crop_factor != crop_factor:
-                self.crop_factor = crop_factor
-                self.src_images = src_images
-                if 1 < src_length:
-                    self.psi_list = self.prepare_source(src_images, crop_factor, True, tracking_src_vid)
-                else:
-                    self.psi_list = [self.prepare_source(src_images, crop_factor)]
-        cmd_list, cmd_length = self.parsing_command(command, motion_link)
-        if cmd_list is None:
-            return None,None
-        cmd_idx = 0
-        driving_length = 0
-        if driving_images is not None:
-            if id(driving_images) != id(self.driving_images):
-                self.driving_images = driving_images
-                self.driving_values = self.prepare_driving_video(driving_images)
-            driving_length = len(self.driving_values)
-        total_length = max(driving_length, src_length)
-        if animate_without_vid:
-            total_length = max(total_length, cmd_length)
-        c_i_es = ExpressionSet()
-        c_o_es = ExpressionSet()
-        d_0_es = None
-        out_list = []
-        psi = None
-        for i in range(total_length):
-            if i < src_length:
-                psi = self.psi_list[i]
-                s_info = psi.x_s_info
-                s_es = ExpressionSet(erst=(s_info['kp'] + s_info['exp'], torch.Tensor([0, 0, 0]), s_info['scale'], s_info['t']))
-            new_es = ExpressionSet(es=s_es)
-            if i < cmd_length:
-                cmd = cmd_list[cmd_idx]
-                if 0 < cmd.change:
-                    cmd.change -= 1
-                    c_i_es.add(cmd.es)
-                    c_i_es.sub(c_o_es)
-                elif 0 < cmd.keep:
-                    cmd.keep -= 1
-                new_es.add(c_i_es)
-                if cmd.change == 0 and cmd.keep == 0:
-                    cmd_idx += 1
-                    if cmd_idx < len(cmd_list):
-                        c_o_es = ExpressionSet(es=c_i_es)
-                        cmd = cmd_list[cmd_idx]
-                        c_o_es.div(cmd.change)
-            elif 0 < cmd_length:
-                new_es.add(c_i_es)
-            if i < driving_length:
-                d_i_info = self.driving_values[i]
-                d_i_r = torch.Tensor([d_i_info['pitch'], d_i_info['yaw'], d_i_info['roll']])#.float().to(device="cuda:0")
-                if d_0_es is None:
-                    d_0_es = ExpressionSet(erst = (d_i_info['exp'], d_i_r, d_i_info['scale'], d_i_info['t']))
-                    self.retargeting(s_es.e, d_0_es.e, retargeting_eyes, (11, 13, 15, 16))
-                    self.retargeting(s_es.e, d_0_es.e, retargeting_mouth, (14, 17, 19, 20))
-                new_es.e += d_i_info['exp'] - d_0_es.e
-                new_es.r += d_i_r - d_0_es.r
-                new_es.t += d_i_info['t'] - d_0_es.t
-            r_new = get_rotation_matrix(
-                s_info['pitch'] + new_es.r[0], s_info['yaw'] + new_es.r[1], s_info['roll'] + new_es.r[2])
-            d_new = new_es.s * (new_es.e @ r_new) + new_es.t
-            d_new = self.pipeline.stitching(psi.x_s_user, d_new)
-            crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, d_new)
-            crop_out = self.pipeline.parse_output(crop_out['out'])[0]
-            crop_with_fullsize = cv2.warpAffine(crop_out, psi.crop_trans_m, get_rgb_size(psi.src_rgb),
-                                                cv2.INTER_LINEAR)
-            out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(
-                np.uint8)
-            out_list.append(out)
-            progress(i/total_length, "predicting..")
-        if len(out_list) == 0:
-            return None
-        out_imgs = torch.cat([pil2tensor(img_rgb) for img_rgb in out_list])
-        return out_imgs
     def download_if_no_models(self,
                               model_type: str = ModelType.HUMAN.value,
@@ -528,7 +496,6 @@ class LivePortraitInferencer:
     @staticmethod
     def retargeting(delta_out, driving_exp, factor, idxes):
         for idx in idxes:
-            # delta_out[0, idx] -= src_exp[0, idx] * factor
             delta_out[0, idx] += driving_exp[0, idx] * factor
     @staticmethod
@@ -552,8 +519,15 @@ class LivePortraitInferencer:
         return new_img
     def prepare_src_image(self, img):
-        h, w = img.shape[:2]
-        input_shape = [256,256]
         if h != input_shape[0] or w != input_shape[1]:
             if 256 < h: interpolation = cv2.INTER_AREA
             else: interpolation = cv2.INTER_LINEAR
@@ -624,11 +598,9 @@ class LivePortraitInferencer:
         return psi_list
     def prepare_driving_video(self, face_images):
-        print("Prepare driving video...")
-        f_img_np = (face_images * 255).byte().numpy()
         out_list = []
-        for f_img in f_img_np:
             i_d = self.prepare_src_image(f_img)
             d_info = self.pipeline.get_kp_info(i_d)
             out_list.append(d_info)

 import time
 import copy
 import dill
+import torch
 from ultralytics import YOLO
 import safetensors.torch
 import gradio as gr
 from gradio_i18n import Translate, gettext as _
 from ultralytics.utils import LOGGER as ultralytics_logger
 from enum import Enum
+from typing import Union, List, Dict, Tuple
 from modules.utils.paths import *
 from modules.utils.image_helper import *
+from modules.utils.video_helper import *
 from modules.live_portrait.model_downloader import *
 from modules.live_portrait.live_portrait_wrapper import LivePortraitWrapper
 from modules.utils.camera import get_rotation_matrix
                  model_dir: str = MODELS_DIR,
                  output_dir: str = OUTPUTS_DIR):
         self.model_dir = model_dir
         self.output_dir = output_dir
+        relative_dirs = [
+            os.path.join(self.model_dir, "animal"),
+            os.path.join(self.output_dir, "videos"),
+            os.path.join(self.output_dir, "temp"),
+            os.path.join(self.output_dir, "temp", "video_frames"),
+            os.path.join(self.output_dir, "temp", "video_frames", "out"),
+        ]
+        for dir_path in relative_dirs:
+            os.makedirs(dir_path, exist_ok=True)
         self.model_config = load_yaml(MODEL_CONFIG)["model_params"]
         self.appearance_feature_extractor = None
     def edit_expression(self,
                         model_type: str = ModelType.HUMAN.value,
+                        rotate_pitch: float = 0,
+                        rotate_yaw: float = 0,
+                        rotate_roll: float = 0,
+                        blink: float = 0,
+                        eyebrow: float = 0,
+                        wink: float = 0,
+                        pupil_x: float = 0,
+                        pupil_y: float = 0,
+                        aaa: float = 0,
+                        eee: float = 0,
+                        woo: float = 0,
+                        smile: float = 0,
+                        src_ratio: float = 1,
+                        sample_ratio: float = 1,
+                        sample_parts: str = SamplePart.ALL.value,
+                        crop_factor: float = 2.3,
+                        src_image: Optional[str] = None,
+                        sample_image: Optional[str] = None,) -> None:
         if isinstance(model_type, ModelType):
             model_type = model_type.value
         if model_type not in [mode.value for mode in ModelType]:
             )
         try:
+            with torch.autocast(device_type=self.device, enabled=(self.device == "cuda")):
+                rotate_yaw = -rotate_yaw
+                if src_image is not None:
+                    if id(src_image) != id(self.src_image) or self.crop_factor != crop_factor:
+                        self.crop_factor = crop_factor
+                        self.psi = self.prepare_source(src_image, crop_factor)
+                        self.src_image = src_image
+                else:
+                    return None
+                psi = self.psi
+                s_info = psi.x_s_info
+                #delta_new = copy.deepcopy()
+                s_exp = s_info['exp'] * src_ratio
+                s_exp[0, 5] = s_info['exp'][0, 5]
+                s_exp += s_info['kp']
+                es = ExpressionSet()
+                if isinstance(sample_image, np.ndarray) and sample_image:
+                    if id(self.sample_image) != id(sample_image):
+                        self.sample_image = sample_image
+                        d_image_np = (sample_image * 255).byte().numpy()
+                        d_face = self.crop_face(d_image_np[0], 1.7)
+                        i_d = self.prepare_src_image(d_face)
+                        self.d_info = self.pipeline.get_kp_info(i_d)
+                        self.d_info['exp'][0, 5, 0] = 0
+                        self.d_info['exp'][0, 5, 1] = 0
+                    # "OnlyExpression", "OnlyRotation", "OnlyMouth", "OnlyEyes", "All"
+                    if sample_parts == SamplePart.ONLY_EXPRESSION.value or sample_parts == SamplePart.ONLY_EXPRESSION.ALL.value:
+                        es.e += self.d_info['exp'] * sample_ratio
+                    if sample_parts == SamplePart.ONLY_ROTATION.value or sample_parts == SamplePart.ONLY_ROTATION.ALL.value:
+                        rotate_pitch += self.d_info['pitch'] * sample_ratio
+                        rotate_yaw += self.d_info['yaw'] * sample_ratio
+                        rotate_roll += self.d_info['roll'] * sample_ratio
+                    elif sample_parts == SamplePart.ONLY_MOUTH.value:
+                        self.retargeting(es.e, self.d_info['exp'], sample_ratio, (14, 17, 19, 20))
+                    elif sample_parts == SamplePart.ONLY_EYES.value:
+                        self.retargeting(es.e, self.d_info['exp'], sample_ratio, (1, 2, 11, 13, 15, 16))
+                es.r = self.calc_fe(es.e, blink, eyebrow, wink, pupil_x, pupil_y, aaa, eee, woo, smile,
+                                    rotate_pitch, rotate_yaw, rotate_roll)
+                new_rotate = get_rotation_matrix(s_info['pitch'] + es.r[0], s_info['yaw'] + es.r[1],
+                                                 s_info['roll'] + es.r[2])
+                x_d_new = (s_info['scale'] * (1 + es.s)) * ((s_exp + es.e) @ new_rotate) + s_info['t']
+                x_d_new = self.pipeline.stitching(psi.x_s_user, x_d_new)
+                crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, x_d_new)
+                crop_out = self.pipeline.parse_output(crop_out['out'])[0]
+                crop_with_fullsize = cv2.warpAffine(crop_out, psi.crop_trans_m, get_rgb_size(psi.src_rgb), cv2.INTER_LINEAR)
+                out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(np.uint8)
+                temp_out_img_path, out_img_path = get_auto_incremental_file_path(TEMP_DIR, "png"), get_auto_incremental_file_path(OUTPUTS_DIR, "png")
+                save_image(numpy_array=crop_out, output_path=temp_out_img_path)
+                save_image(numpy_array=out, output_path=out_img_path)
+                return out
         except Exception as e:
             raise
     def create_video(self,
                      model_type: str = ModelType.HUMAN.value,
                      retargeting_eyes: float = 1,
                      retargeting_mouth: float = 1,
                      crop_factor: float = 2.3,
                      src_image: Optional[str] = None,
                      driving_vid_path: Optional[str] = None,
                      progress: gr.Progress = gr.Progress()
                      ):
         """
         Animate the source image with the motion of a driving video and encode the result as a video.

         The driving video is split into frames (and a sound file, when FFmpeg can extract one) inside
         ``<output_dir>/temp/video_frames``. One output frame per driving frame is written to the ``out``
         sub-directory of that folder, and the frames are finally encoded into ``<output_dir>/videos``.

         Args:
             model_type: Model family to use; the models are (re)loaded when it differs from the loaded one.
             retargeting_eyes: Factor passed to ``retargeting`` for key-point indices (11, 13, 15, 16).
             retargeting_mouth: Factor passed to ``retargeting`` for key-point indices (14, 17, 19, 20).
             crop_factor: Crop factor handed to ``prepare_source``.
             src_image: Source image to animate. When ``None`` the previously prepared source is reused.
             driving_vid_path: Path of the driving video.
             progress: Gradio progress tracker.

         Returns:
             The path returned by ``create_video_from_frames``.
         """
         if self.pipeline is None or model_type != self.model_type:
             self.load_models(
                 model_type=model_type
             )

         # Derive every working directory from self.output_dir so the frames written below are exactly the
         # frames that get encoded, also when a non-default output_dir was given to the constructor.
         frames_dir = os.path.join(self.output_dir, "temp", "video_frames")
         out_frames_dir = os.path.join(frames_dir, "out")
         videos_dir = os.path.join(self.output_dir, "videos")

         vid_info = get_video_info(vid_input=driving_vid_path)

         if src_image is not None:
             if id(src_image) != id(self.src_image) or self.crop_factor != crop_factor:
                 self.crop_factor = crop_factor
                 self.src_image = src_image
                 self.psi_list = [self.prepare_source(src_image, crop_factor)]

         progress(0, desc="Extracting frames from the video..")
         # Frames first: extract_frames cleans the directory, which would also delete a fresh sound file.
         driving_images = extract_frames(driving_vid_path, frames_dir)
         vid_sound = extract_sound(driving_vid_path, frames_dir)
         if vid_sound is not None and not os.path.isfile(vid_sound):
             # Extraction can fail (e.g. a video without audio track); never pass a missing file to FFmpeg.
             vid_sound = None

         driving_length = 0
         if driving_images is not None:
             if id(driving_images) != id(self.driving_images):
                 self.driving_images = driving_images
                 self.driving_values = self.prepare_driving_video(driving_images)
             driving_length = len(self.driving_values)

         total_length = len(driving_images)

         c_i_es = ExpressionSet()
         c_o_es = ExpressionSet()
         d_0_es = None

         psi = None
         with torch.autocast(device_type=self.device, enabled=(self.device == "cuda")):
             for i in range(total_length):

                 if i == 0:
                     psi = self.psi_list[i]
                     s_info = psi.x_s_info
                     s_es = ExpressionSet(erst=(s_info['kp'] + s_info['exp'], torch.Tensor([0, 0, 0]), s_info['scale'], s_info['t']))

                 new_es = ExpressionSet(es=s_es)

                 if i < driving_length:
                     d_i_info = self.driving_values[i]
                     d_i_r = torch.Tensor([d_i_info['pitch'], d_i_info['yaw'], d_i_info['roll']]) # .float().to(device="cuda:0")

                     if d_0_es is None:
                         # The first driving frame is the reference every later frame is compared against.
                         d_0_es = ExpressionSet(erst = (d_i_info['exp'], d_i_r, d_i_info['scale'], d_i_info['t']))

                         self.retargeting(s_es.e, d_0_es.e, retargeting_eyes, (11, 13, 15, 16))
                         self.retargeting(s_es.e, d_0_es.e, retargeting_mouth, (14, 17, 19, 20))

                     # Apply the driving motion relative to the first driving frame.
                     new_es.e += d_i_info['exp'] - d_0_es.e
                     new_es.r += d_i_r - d_0_es.r
                     new_es.t += d_i_info['t'] - d_0_es.t

                 r_new = get_rotation_matrix(
                     s_info['pitch'] + new_es.r[0], s_info['yaw'] + new_es.r[1], s_info['roll'] + new_es.r[2])
                 d_new = new_es.s * (new_es.e @ r_new) + new_es.t
                 d_new = self.pipeline.stitching(psi.x_s_user, d_new)
                 crop_out = self.pipeline.warp_decode(psi.f_s_user, psi.x_s_user, d_new)
                 crop_out = self.pipeline.parse_output(crop_out['out'])[0]

                 # Paste the generated crop back into the full-size source image.
                 crop_with_fullsize = cv2.warpAffine(crop_out, psi.crop_trans_m, get_rgb_size(psi.src_rgb),
                                                     cv2.INTER_LINEAR)
                 out = np.clip(psi.mask_ori * crop_with_fullsize + (1 - psi.mask_ori) * psi.src_rgb, 0, 255).astype(
                     np.uint8)

                 out_frame_path = get_auto_incremental_file_path(out_frames_dir, "png")
                 save_image(out, out_frame_path)

                 progress(i/total_length, desc=f"Generating frames {i}/{total_length} ..")

             video_path = create_video_from_frames(
                 out_frames_dir,
                 frame_rate=vid_info.frame_rate,
                 sound_path=vid_sound,
                 output_dir=videos_dir
             )

             return video_path
     def download_if_no_models(self,
                               model_type: str = ModelType.HUMAN.value,
     @staticmethod
     def retargeting(delta_out, driving_exp, factor, idxes):
         for idx in idxes:
             delta_out[0, idx] += driving_exp[0, idx] * factor
     @staticmethod
         return new_img
     def prepare_src_image(self, img):
+        if isinstance(img, str):
+            img = image_path_to_array(img)
+        if len(img.shape) <= 3:
+            img = img[np.newaxis, ...]
+        d, h, w, c = img.shape
+        img = img[0] # Select first dimension
+        input_shape = [256, 256]
         if h != input_shape[0] or w != input_shape[1]:
             if 256 < h: interpolation = cv2.INTER_AREA
             else: interpolation = cv2.INTER_LINEAR
         return psi_list
     def prepare_driving_video(self, face_images):
+        # print("Prepare driving video...")
         out_list = []
+        for f_img in face_images:
             i_d = self.prepare_src_image(f_img)
             d_info = self.pipeline.get_kp_info(i_d)
             out_list.append(d_info)

modules/utils/constants.py CHANGED Viewed

@@ -31,4 +31,10 @@ GRADIO_CSS = """
 #blink_slider .md.svelte-7ddecg.chatbot.prose {
     font-size: 0.7em;
 }
-"""

 #blink_slider .md.svelte-7ddecg.chatbot.prose {
     font-size: 0.7em;
 }
+"""
+SOUND_FILE_EXT = ['.mp3', '.wav', '.aac', '.flac', '.ogg', '.m4a', '.wma']
+IMAGE_FILE_EXT = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']
+VIDEO_FILE_EXT = ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.mpeg', '.mpg', '.m4v', '.3gp', '.ts', '.vob', '.gif']
+TRANSPARENT_VIDEO_FILE_EXT = ['.webm', '.mov', '.gif']
+SUPPORTED_VIDEO_FILE_EXT = ['.mp4', '.mov', '.webm', '.gif']

modules/utils/image_helper.py CHANGED Viewed

@@ -56,6 +56,7 @@ def calc_crop_limit(center, img_size, crop_size):
 def save_image(numpy_array: np.ndarray, output_path: str):
     out = Image.fromarray(numpy_array)
     out.save(output_path, compress_level=1, format="png")
 def image_path_to_array(image_path: str) -> np.ndarray:

 def save_image(numpy_array: np.ndarray, output_path: str):
     out = Image.fromarray(numpy_array)
     out.save(output_path, compress_level=1, format="png")
+    return output_path
 def image_path_to_array(image_path: str) -> np.ndarray:

modules/utils/paths.py CHANGED Viewed

@@ -6,7 +6,10 @@ PROJECT_ROOT_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), ".."
 MODELS_DIR = os.path.join(PROJECT_ROOT_DIR, "models")
 MODELS_ANIMAL_DIR = os.path.join(MODELS_DIR, "animal")
 OUTPUTS_DIR = os.path.join(PROJECT_ROOT_DIR, "outputs")
 TEMP_DIR = os.path.join(OUTPUTS_DIR, "temp")
 EXP_OUTPUT_DIR = os.path.join(OUTPUTS_DIR, "exp_data")
 MODEL_CONFIG = os.path.join(PROJECT_ROOT_DIR, "modules", "config", "models.yaml")
 MODEL_PATHS = {
@@ -31,7 +34,7 @@ I18N_YAML_PATH = os.path.join(PROJECT_ROOT_DIR, "i18n", "translation.yaml")
 def get_auto_incremental_file_path(dir_path: str, extension: str, prefix: str = ""):
-    counter = 0
     while True:
         if prefix:
             filename = f"{prefix}_{counter:05d}.{extension}"
@@ -39,6 +42,7 @@ def get_auto_incremental_file_path(dir_path: str, extension: str, prefix: str =
             filename = f"{counter:05d}.{extension}"
         full_path = os.path.join(dir_path, filename)
         if not os.path.exists(full_path):
             return full_path
         counter += 1
@@ -50,7 +54,10 @@ def init_dirs():
         MODELS_ANIMAL_DIR,
         OUTPUTS_DIR,
         EXP_OUTPUT_DIR,
-        TEMP_DIR
     ]:
         os.makedirs(dir_path, exist_ok=True)

 MODELS_DIR = os.path.join(PROJECT_ROOT_DIR, "models")
 MODELS_ANIMAL_DIR = os.path.join(MODELS_DIR, "animal")
 OUTPUTS_DIR = os.path.join(PROJECT_ROOT_DIR, "outputs")
+OUTPUTS_VIDEOS_DIR = os.path.join(OUTPUTS_DIR, "videos")
 TEMP_DIR = os.path.join(OUTPUTS_DIR, "temp")
+TEMP_VIDEO_FRAMES_DIR = os.path.join(TEMP_DIR, "video_frames")
+TEMP_VIDEO_OUT_FRAMES_DIR = os.path.join(TEMP_VIDEO_FRAMES_DIR, "out")
 EXP_OUTPUT_DIR = os.path.join(OUTPUTS_DIR, "exp_data")
 MODEL_CONFIG = os.path.join(PROJECT_ROOT_DIR, "modules", "config", "models.yaml")
 MODEL_PATHS = {
 def get_auto_incremental_file_path(dir_path: str, extension: str, prefix: str = ""):
+    counter = len(os.listdir(dir_path))
     while True:
         if prefix:
             filename = f"{prefix}_{counter:05d}.{extension}"
             filename = f"{counter:05d}.{extension}"
         full_path = os.path.join(dir_path, filename)
         if not os.path.exists(full_path):
+            full_path = os.path.normpath(full_path)
             return full_path
         counter += 1
         MODELS_ANIMAL_DIR,
         OUTPUTS_DIR,
         EXP_OUTPUT_DIR,
+        TEMP_DIR,
+        TEMP_VIDEO_FRAMES_DIR,
+        TEMP_VIDEO_OUT_FRAMES_DIR,
+        OUTPUTS_VIDEOS_DIR
     ]:
         os.makedirs(dir_path, exist_ok=True)

modules/utils/video_helper.py ADDED Viewed

	@@ -0,0 +1,315 @@

+import subprocess
+import os
+from typing import List, Optional, Union
+import cv2
+from PIL import Image
+import numpy as np
+from dataclasses import dataclass
+import re
+from pathlib import Path
+from modules.utils.constants import SOUND_FILE_EXT, VIDEO_FILE_EXT, IMAGE_FILE_EXT
+from modules.utils.paths import (TEMP_VIDEO_FRAMES_DIR, TEMP_VIDEO_OUT_FRAMES_DIR, OUTPUTS_VIDEOS_DIR,
+                                 get_auto_incremental_file_path)
@dataclass
class VideoInfo:
    """Basic properties of a video file. Every field is ``None`` until a value is supplied."""
    # Frame count; get_video_info derives it as int(frame_rate * duration).
    num_frames: Optional[int] = None
    # Frames per second. A float, because get_video_info parses it with float() (e.g. 29.97).
    frame_rate: Optional[float] = None
    # Length in seconds.
    duration: Optional[float] = None
    # Whether an audio stream was found.
    has_sound: Optional[bool] = None
    # Name of the video codec.
    codec: Optional[str] = None
def extract_frames(
    vid_input: str,
    output_temp_dir: str = TEMP_VIDEO_FRAMES_DIR,
    start_number: int = 0,
    clean=True
):
    """
    Extract frames as jpg files and save them into output_temp_dir. This needs FFmpeg installed.

    Args:
        vid_input: Path of the video to split into frames.
        output_temp_dir: Directory that receives the ``%05d.jpg`` frames; created when missing.
        start_number: Number used for the first extracted frame.
        clean: When true, ``clean_temp_dir`` is run on the directory before extracting.

    Returns:
        Whatever ``get_frames_from_dir(output_temp_dir)`` returns for the extracted frames.

    Raises:
        RuntimeError: If the FFmpeg process exits with a non-zero status.
    """
    # Create the directory before cleaning it: the cleanup lists the directory and fails on a missing one.
    os.makedirs(output_temp_dir, exist_ok=True)
    if clean:
        clean_temp_dir(temp_dir=output_temp_dir)

    output_path = os.path.join(output_temp_dir, "%05d.jpg")

    command = [
        'ffmpeg',
        '-loglevel', 'error',
        '-y',  # Enable overwriting
        '-i', vid_input,
        '-qscale:v', '2',
        '-vf', 'scale=iw:ih',
        '-start_number', str(start_number),
        output_path
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print("Error occurred while extracting frames from the video")
        # Keep the original FFmpeg failure attached as the cause of the re-raised error.
        raise RuntimeError(f"An error occurred: {str(e)}") from e

    print(f"Video frames extracted to \"{os.path.normpath(output_temp_dir)}\"")
    return get_frames_from_dir(output_temp_dir)
def extract_sound(
    vid_input: str,
    output_temp_dir: str = TEMP_VIDEO_FRAMES_DIR,
):
    """
    Extract audio from a video file and save it as a separate sound file. This needs FFmpeg installed.

    Args:
        vid_input: Path of the video to read.
        output_temp_dir: Directory that receives ``sound.mp3``; created when missing.

    Returns:
        Path of the written ``sound.mp3``, or ``None`` when the input is a gif or FFmpeg failed to
        extract the audio.
    """
    # Compare case-insensitively so ".GIF" is skipped as well.
    if Path(vid_input).suffix.lower() == ".gif":
        print("Sound extracting process has passed because gif has no sound")
        return None

    os.makedirs(output_temp_dir, exist_ok=True)
    output_path = os.path.join(output_temp_dir, "sound.mp3")

    command = [
        'ffmpeg',
        '-loglevel', 'error',
        '-y',  # Enable overwriting
        '-i', vid_input,
        '-vn',
        output_path
    ]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Warning: Failed to extract sound from the video: {e}")
        # Whatever sits at output_path now is not this video's audio (stale or partial file);
        # remove it so it cannot be picked up later, and report "no sound" to the caller.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError as remove_error:
                print(f"Warning: Failed to remove \"{output_path}\": {remove_error}")
        return None

    return output_path
def get_video_info(vid_input: str) -> VideoInfo:
    """
    Extract video information using ffmpeg.

    FFmpeg is run with a stream copy of the first video stream into the null muxer and its stderr
    log is scanned for the frame rate, codec, duration and the presence of an audio stream.

    Args:
        vid_input: Path of the video to inspect.

    Returns:
        A populated ``VideoInfo``, or an empty ``VideoInfo()`` when FFmpeg exits with an error.
    """
    ffmpeg_args = ['ffmpeg', '-i', vid_input, '-map', '0:v:0', '-c', 'copy', '-f', 'null', '-']

    try:
        completed = subprocess.run(ffmpeg_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                   encoding='utf-8', errors='replace', check=True)
    except subprocess.CalledProcessError:
        print("Error occurred while getting info from the video")
        return VideoInfo()

    fps = None
    length_sec = None
    vid_codec = None
    audio_found = False

    # FFmpeg writes its stream description to stderr.
    for log_line in completed.stderr.splitlines():
        if 'Stream #0:0' in log_line and 'Video:' in log_line:
            found = re.search(r'(\d+(?:\.\d+)?) fps', log_line)
            if found:
                fps = float(found.group(1))

            found = re.search(r'Video: (\w+)', log_line)
            if found:
                vid_codec = found.group(1)
            continue

        if 'Duration:' in log_line:
            found = re.search(r'Duration: (\d{2}):(\d{2}):(\d{2}\.\d{2})', log_line)
            if found:
                hours, minutes, seconds = (float(part) for part in found.groups())
                length_sec = hours * 3600 + minutes * 60 + seconds
            continue

        if 'Stream' in log_line and 'Audio:' in log_line:
            audio_found = True

    # The frame count is an estimate derived from rate and duration.
    frame_total = int(fps * length_sec) if fps and length_sec else None

    print(f"Video info - frame_rate: {fps}, duration: {length_sec}, total frames: {frame_total}")
    return VideoInfo(
        num_frames=frame_total,
        frame_rate=fps,
        duration=length_sec,
        has_sound=audio_found,
        codec=vid_codec
    )
def create_video_from_frames(
    frames_dir: str,
    frame_rate: Optional[int] = None,
    sound_path: Optional[str] = None,
    output_dir: Optional[str] = None,
    output_mime_type: Optional[str] = None,
):
    """
    Create a video from frames and save it into output_dir. This needs FFmpeg installed.

    Args:
        frames_dir: Directory holding the ``%05d.png`` frames to encode.
        frame_rate: Frame rate of the video. Defaults to 25.
        sound_path: Audio file to mux in. When ``None``, ``sound.mp3`` inside the temp video frames
            directory is used if it exists. Ignored for gif output.
        output_dir: Directory that receives the video. Defaults to the outputs videos directory.
        output_mime_type: Output extension: ".mp4" (default), ".mov", ".webm" or ".gif".

    Returns:
        Path of the created video file.

    Raises:
        FileNotFoundError: If ``frames_dir`` does not exist.
        subprocess.CalledProcessError: If the FFmpeg process exits with a non-zero status.
    """
    if not os.path.exists(frames_dir):
        # A bare string cannot be raised in Python 3; raise a real exception type.
        raise FileNotFoundError(f"frames_dir does not exist: \"{frames_dir}\"")
    frames_dir = os.path.normpath(frames_dir)

    if output_dir is None:
        output_dir = OUTPUTS_VIDEOS_DIR
    os.makedirs(output_dir, exist_ok=True)

    frame_img_mime_type = ".png"
    pix_format = "yuv420p"
    vid_codec, audio_codec = "libx264", "aac"

    if output_mime_type is None:
        output_mime_type = ".mp4"

    output_mime_type = output_mime_type.lower()
    if output_mime_type == ".mov":
        pix_format = "yuva444p10le"
        vid_codec, audio_codec = "prores_ks", "aac"

    elif output_mime_type == ".webm":
        pix_format = "yuva420p"
        vid_codec, audio_codec = "libvpx-vp9", "libvorbis"

    elif output_mime_type == ".gif":
        pix_format = None
        vid_codec, audio_codec = "gif", None

    is_gif = output_mime_type == ".gif"
    output_path = get_auto_incremental_file_path(output_dir, output_mime_type.replace(".", ""))

    if sound_path is None:
        temp_sound = os.path.normpath(os.path.join(TEMP_VIDEO_FRAMES_DIR, "sound.mp3"))
        if os.path.exists(temp_sound):
            sound_path = temp_sound
    with_sound = not is_gif and sound_path is not None

    if frame_rate is None:
        frame_rate = 25  # Default frame rate for ffmpeg

    # FFmpeg applies options to the next file named on the command line, so the order is:
    # all inputs, then the output options, then the output path as the very last argument.
    command = [
        'ffmpeg',
        '-loglevel', 'error',
        '-y',
        '-framerate', str(frame_rate),
        '-i', os.path.join(frames_dir, f"%05d{frame_img_mime_type}"),
    ]
    if with_sound:
        command += ['-i', sound_path]

    command += ['-c:v', vid_codec]

    if is_gif:
        # gif has no pixel format to set; build the palette from the frames instead.
        command += [
            "-filter_complex", "[0:v] palettegen=reserve_transparent=on [p]; [0:v][p] paletteuse",
            "-loop", "0"
        ]
    else:
        command += [
            # Crop to even width and height.
            '-vf', 'crop=trunc(iw/2)*2:trunc(ih/2)*2',
            '-pix_fmt', pix_format
        ]

    if with_sound:
        command += [
            '-c:a', audio_codec,
            '-strict', 'experimental',
            '-b:a', '192k',
            '-shortest'
        ]

    command += [output_path]

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        print("Error occurred while creating video from frames")
        raise
    return output_path
def create_video_from_numpy_list(frame_list: List[np.ndarray],
                                 frame_rate: Optional[int] = None,
                                 sound_path: Optional[str] = None,
                                 output_dir: Optional[str] = None
                                 ):
    """
    Write a list of RGB frames to an mp4 file with OpenCV.

    Args:
        frame_list: Frames to write, each an RGB array of shape (height, width, channels). The size
            of the first frame is used for the whole video.
        frame_rate: Frame rate of the video. Defaults to 25.
        sound_path: Accepted for signature compatibility; this function writes video frames only
            and does not mux any audio.
        output_dir: Directory that receives the video. Defaults to the outputs videos directory.

    Returns:
        Path of the written mp4 file.

    Raises:
        ValueError: If ``frame_list`` is empty.
    """
    if len(frame_list) == 0:
        raise ValueError("frame_list must contain at least one frame")

    if output_dir is None:
        output_dir = OUTPUTS_VIDEOS_DIR
    os.makedirs(output_dir, exist_ok=True)
    output_path = get_auto_incremental_file_path(output_dir, "mp4")

    if frame_rate is None:
        frame_rate = 25

    height, width = frame_list[0].shape[:2]
    fourcc = cv2.VideoWriter.fourcc(*'mp4v')

    out = cv2.VideoWriter(output_path, fourcc, frame_rate, (width, height))
    try:
        for frame in frame_list:
            # OpenCV expects BGR channel order.
            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Release the writer even when a frame fails to convert.
        out.release()

    return output_path
def get_frames_from_dir(vid_dir: str,
                        available_extensions: Optional[Union[List, str]] = None,
                        as_numpy: bool = False) -> List:
    """
    Get image file paths list from the dir, ordered by their numeric file name.

    Args:
        vid_dir: Directory to scan (not recursive).
        available_extensions: Extension or list of extensions to accept, matched case-insensitively.
            Defaults to jpg/jpeg.
        as_numpy: When true, return the images loaded as numpy arrays instead of their paths.

    Returns:
        The frames sorted by the integer value of their file name. Files whose name is not a number
        are not frames and are left out. An empty list is returned when nothing matches.
    """
    if available_extensions is None:
        available_extensions = [".jpg", ".jpeg", ".JPG", ".JPEG"]

    if isinstance(available_extensions, str):
        available_extensions = [available_extensions]

    allowed = {ext.lower() for ext in available_extensions}

    numbered = []
    for name in os.listdir(vid_dir):
        stem, ext = os.path.splitext(name)
        if ext.lower() not in allowed:
            continue
        try:
            index = int(stem)
        except ValueError:
            # A stray, non-numbered image must not break the numeric sort.
            continue
        numbered.append((index, name))

    if not numbered:
        return []

    numbered.sort(key=lambda item: item[0])

    frames = [os.path.join(vid_dir, name) for _, name in numbered]
    if as_numpy:
        frames = [np.array(Image.open(frame)) for frame in frames]

    return frames
def clean_temp_dir(temp_dir: Optional[str] = None):
    """
    Removes media files from the video frames directory.

    Sound and image files are removed from ``temp_dir`` and image files from its ``out``
    sub-directory. Directories that do not exist are skipped.

    Args:
        temp_dir: Directory to clean. Defaults to the temp video frames directory.
    """
    if temp_dir is None:
        temp_dir = TEMP_VIDEO_FRAMES_DIR
        temp_out_dir = TEMP_VIDEO_OUT_FRAMES_DIR
    else:
        temp_out_dir = os.path.join(temp_dir, "out")

    # Guard both directories the same way: there is nothing to clean in a directory that is not there.
    if os.path.isdir(temp_dir):
        clean_files_with_extension(temp_dir, SOUND_FILE_EXT)
        clean_files_with_extension(temp_dir, IMAGE_FILE_EXT)

    if os.path.isdir(temp_out_dir):
        clean_files_with_extension(temp_out_dir, IMAGE_FILE_EXT)
def clean_files_with_extension(dir_path: str, extensions: List):
    """
    Remove files with the given extensions from the directory.

    Args:
        dir_path: Directory to clean (not recursive). A missing directory is ignored.
        extensions: File name suffixes to remove, e.g. ``[".jpg", ".png"]``; matched case-insensitively.
    """
    if not os.path.isdir(dir_path):
        return

    # File names are lower-cased for the comparison, so the suffixes have to be lower-cased as well.
    suffixes = tuple(ext.lower() for ext in extensions)

    for filename in os.listdir(dir_path):
        if not filename.lower().endswith(suffixes):
            continue

        file_path = os.path.join(dir_path, filename)
        try:
            os.remove(file_path)
        except OSError as e:
            # Best effort: report the file that could not be removed and keep going.
            print(f"Error while removing \"{file_path}\": {e}")

requirements.txt CHANGED Viewed

@@ -13,4 +13,10 @@ ultralytics
 tyro
 dill
 gradio
-gradio-i18n

 tyro
 dill
 gradio
+gradio-i18n
+# Tests
+# pytest
+# scikit-image
+# moviepy

tests/test_config.py CHANGED Viewed

@@ -4,13 +4,18 @@ import os
 import torch
 import functools
 import numpy as np
 from modules.utils.paths import *
 TEST_IMAGE_URL = "https://github.com/microsoft/onnxjs-demo/raw/master/src/assets/EmotionSampleImages/sad_baby.jpg"
-TEST_IMAGE_PATH = os.path.join(PROJECT_ROOT_DIR, "tests", "test.png")
-TEST_EXPRESSION_OUTPUT_PATH = os.path.join(PROJECT_ROOT_DIR, "tests", "edited_expression.png")
 TEST_EXPRESSION_AAA = 100
@@ -40,6 +45,62 @@ def are_images_different(image1_path: str, image2_path: str):
         return True
 @functools.lru_cache
 def is_cuda_available():
     return torch.cuda.is_available()

 import torch
 import functools
 import numpy as np
+import cv2
+from skimage.metrics import structural_similarity as compare_ssim
+from moviepy.editor import VideoFileClip
 from modules.utils.paths import *
 TEST_IMAGE_URL = "https://github.com/microsoft/onnxjs-demo/raw/master/src/assets/EmotionSampleImages/sad_baby.jpg"
+TEST_VIDEO_URL = "https://github.com/jhj0517/sample-medias/raw/master/vids/human-face/expression01_short.mp4"
+TEST_IMAGE_PATH = os.path.normpath(os.path.join(PROJECT_ROOT_DIR, "tests", "test.png"))
+TEST_VIDEO_PATH = os.path.normpath(os.path.join(PROJECT_ROOT_DIR, "tests", "test_expression.mp4"))
+TEST_EXPRESSION_OUTPUT_PATH = os.path.normpath(os.path.join(PROJECT_ROOT_DIR, "tests", "edited_expression.png"))
 TEST_EXPRESSION_AAA = 100
         return True
def are_videos_different(video1_path: str, video2_path: str):
    """
    Compare two videos frame by frame.

    Returns:
        True when one video has more frames than the other or when any pair of frames has a
        structural similarity below 0.99; False otherwise.
    """
    cap1 = cv2.VideoCapture(video1_path)
    cap2 = cv2.VideoCapture(video2_path)

    try:
        while True:
            ret1, frame1 = cap1.read()
            ret2, frame2 = cap2.read()

            if not ret1 or not ret2:
                # Both ended together -> same length; only one ended -> different.
                return ret1 != ret2

            if frame1.shape != frame2.shape:
                frame1 = cv2.resize(frame1, (frame2.shape[1], frame2.shape[0]))

            # channel_axis replaces the deprecated multichannel flag; the colour channels are the last axis.
            score, _ = compare_ssim(frame1, frame2, full=True, channel_axis=-1)

            if score < 0.99:
                return True
    finally:
        # Release the captures on every exit path, including the early returns.
        cap1.release()
        cap2.release()
def validate_video(video_path):
    """Return True when the video can be opened and at least one frame can be read from it."""
    capture = cv2.VideoCapture(video_path)

    if not capture.isOpened():
        print("Could not open video file.")
        return False

    # Count frames until the first failed read.
    readable_frames = 0
    while capture.read()[0]:
        readable_frames += 1

    capture.release()

    if readable_frames > 0:
        return True

    print("No frames found in video file.")
    return False
def has_sound(video_path: str):
    """Return True when the video has an audio track; False when it has none or cannot be opened."""
    try:
        video = VideoFileClip(video_path)
        try:
            return video.audio is not None
        finally:
            # Close the clip so its reader resources are released.
            video.close()
    except Exception:
        return False
 @functools.lru_cache
 def is_cuda_available():
     return torch.cuda.is_available()

tests/test_video_creation.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import os
+import pytest
+from test_config import *
+from modules.live_portrait.live_portrait_inferencer import LivePortraitInferencer
+from modules.utils.image_helper import save_image
@pytest.mark.parametrize(
    "input_image,expression_video",
    [
        (TEST_IMAGE_PATH, TEST_VIDEO_PATH),
    ]
)
def test_video_creation(
    input_image: str,
    expression_video: str
):
    """Create a video from the test image and driving video and check it is playable and has sound."""
    # Fetch the fixtures that are not on disk yet.
    fixtures = (
        (TEST_IMAGE_URL, TEST_IMAGE_PATH),
        (TEST_VIDEO_URL, TEST_VIDEO_PATH),
    )
    for url, local_path in fixtures:
        if not os.path.exists(local_path):
            download_image(url, local_path)

    output_video_path = LivePortraitInferencer().create_video(
        driving_vid_path=expression_video,
        src_image=input_image,
    )

    assert os.path.exists(output_video_path)
    assert validate_video(output_video_path)
    assert has_sound(output_video_path)