Upload 4 files
- README.md.txt +5 -0
- app.py +44 -0
- inference.py +158 -0
- requirements.txt +12 -0
README.md.txt
ADDED
@@ -0,0 +1,5 @@
Faceless Video Generator (Free)
This tool lets you upload a photo and a text script and get back an AI-generated talking video.
It uses SadTalker, Coqui TTS, and Whisper to build everything locally in one step.
Built with Gradio + Hugging Face Spaces.
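For reference, the two steps the app chains together can also be run by hand. A minimal sketch, assuming the SadTalker checkpoints already sit in ./checkpoints and using placeholder file names (avatar.jpg, audio.wav):

    import subprocess

    # 1. Synthesize speech from the script with the Coqui TTS command-line tool
    subprocess.run(["tts", "--text", "Hello from my avatar!", "--out_path", "audio.wav"], check=True)

    # 2. Animate the source photo with SadTalker, driven by that audio
    subprocess.run(["python", "inference.py",
                    "--driven_audio", "audio.wav",
                    "--source_image", "avatar.jpg",
                    "--result_dir", "results"], check=True)

The generated video ends up in a timestamped folder under results/.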
app.py
ADDED
@@ -0,0 +1,44 @@
from huggingface_hub import snapshot_download
import gradio as gr
import subprocess
import glob
import os
import uuid

# Download the SadTalker checkpoints from Hugging Face (one-time setup)
def setup_models():
    if not os.path.exists("checkpoints"):
        snapshot_download(repo_id="OpenTalker/SadTalker", local_dir="checkpoints")

setup_models()

# Main function to generate the talking-head video
def generate(text, image):
    session = str(uuid.uuid4())[:8]
    result_dir = f"results/{session}"
    os.makedirs(result_dir, exist_ok=True)

    # Save the uploaded image (convert to RGB so it can be written as JPEG)
    image_path = f"{result_dir}/avatar.jpg"
    image.convert("RGB").save(image_path)

    # Generate speech audio from the script using the Coqui TTS CLI
    audio_path = f"{result_dir}/audio.wav"
    subprocess.run(["tts", "--text", text, "--out_path", audio_path], check=True)

    # Run SadTalker to animate the photo with the generated audio
    subprocess.run(
        ["python", "inference.py",
         "--driven_audio", audio_path,
         "--source_image", image_path,
         "--result_dir", result_dir],
        check=True,
    )

    # SadTalker writes the video into a timestamped subfolder; return the newest .mp4
    videos = sorted(glob.glob(f"{result_dir}/**/*.mp4", recursive=True), key=os.path.getmtime)
    if not videos:
        raise gr.Error("Video generation failed: no output file was produced.")
    return videos[-1]

# Gradio interface
gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="📝 Script", placeholder="Enter your script here..."),
        gr.Image(label="🖼️ Avatar Image", type="pil")
    ],
    outputs=gr.Video(label="🎬 Generated Video"),
    title="🆓 Faceless Video Generator",
    description="Upload a face photo + script, and get a talking avatar video powered by SadTalker + Coqui TTS."
).launch()
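Once the Space is running, the same generate endpoint can be called programmatically. A minimal sketch using gradio_client; the Space id your-username/faceless-video-generator and the input file avatar.jpg are placeholders, and the exact way file inputs are passed (handle_file) can vary between gradio_client versions:

    from gradio_client import Client, handle_file

    client = Client("your-username/faceless-video-generator")  # placeholder Space id
    video_path = client.predict(
        "Welcome to my channel!",    # script text
        handle_file("avatar.jpg"),   # local portrait photo
        api_name="/predict",         # default endpoint name for gr.Interface
    )
    print(video_path)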
inference.py
ADDED
@@ -0,0 +1,158 @@
import torch
from time import strftime
import os, sys, time
from argparse import ArgumentParser

from src.utils.preprocess import CropAndExtract
from src.test_audio2coeff import Audio2Coeff
from src.facerender.animate import AnimateFromCoeff
from src.generate_batch import get_data
from src.generate_facerender_batch import get_facerender_data

def main(args):
    # torch.backends.cudnn.enabled = False

    pic_path = args.source_image
    audio_path = args.driven_audio
    save_dir = os.path.join(args.result_dir, strftime("%Y_%m_%d_%H.%M.%S"))
    os.makedirs(save_dir, exist_ok=True)
    pose_style = args.pose_style
    device = args.device
    batch_size = args.batch_size
    input_yaw_list = args.input_yaw
    input_pitch_list = args.input_pitch
    input_roll_list = args.input_roll
    ref_eyeblink = args.ref_eyeblink
    ref_pose = args.ref_pose

    current_code_path = sys.argv[0]
    current_root_path = os.path.split(current_code_path)[0]

    os.environ['TORCH_HOME'] = os.path.join(current_root_path, args.checkpoint_dir)

    path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
    path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
    dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting')
    wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')

    audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
    audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')

    audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
    audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')

    free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')

    if args.preprocess == 'full':
        mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
        facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
    else:
        mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
        facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')

    # init models
    print(path_of_net_recon_model)
    preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)

    print(audio2pose_checkpoint)
    print(audio2exp_checkpoint)
    audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
                                 audio2exp_checkpoint, audio2exp_yaml_path,
                                 wav2lip_checkpoint, device)

    print(free_view_checkpoint)
    print(mapping_checkpoint)
    animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
                                          facerender_yaml_path, device)

    # crop image and extract 3DMM coefficients from the source image
    first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
    os.makedirs(first_frame_dir, exist_ok=True)
    print('3DMM Extraction for source image')
    first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
    if first_coeff_path is None:
        print("Can't get the coeffs of the input")
        return

    if ref_eyeblink is not None:
        ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
        ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
        os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
        print('3DMM Extraction for the reference video providing eye blinking')
        ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
    else:
        ref_eyeblink_coeff_path = None

    if ref_pose is not None:
        if ref_pose == ref_eyeblink:
            ref_pose_coeff_path = ref_eyeblink_coeff_path
        else:
            ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
            ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
            os.makedirs(ref_pose_frame_dir, exist_ok=True)
            print('3DMM Extraction for the reference video providing pose')
            ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
    else:
        ref_pose_coeff_path = None

    # audio2coeff: map the driven audio to motion coefficients
    batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
    coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)

    # optional 3D face visualization
    if args.face3dvis:
        from src.face3d.visualize import gen_composed_video
        gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))

    # coeff2video: render the final talking-head video
    data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
                               batch_size, input_yaw_list, input_pitch_list, input_roll_list,
                               expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)

    animate_from_coeff.generate(data, save_dir, pic_path, crop_info,
                                enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)

if __name__ == '__main__':

    parser = ArgumentParser()
    parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', help="path to driven audio")
    parser.add_argument("--source_image", default='./examples/source_image/full_body_2.png', help="path to source image")
    parser.add_argument("--ref_eyeblink", default=None, help="path to reference video providing eye blinking")
    parser.add_argument("--ref_pose", default=None, help="path to reference video providing pose")
    parser.add_argument("--checkpoint_dir", default='./checkpoints', help="path to model checkpoints")
    parser.add_argument("--result_dir", default='./results', help="path to output")
    parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)")
    parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender")
    parser.add_argument("--expression_scale", type=float, default=1., help="the expression scale of facerender")
    parser.add_argument('--input_yaw', nargs='+', type=int, default=None, help="the input yaw degree of the user")
    parser.add_argument('--input_pitch', nargs='+', type=int, default=None, help="the input pitch degree of the user")
    parser.add_argument('--input_roll', nargs='+', type=int, default=None, help="the input roll degree of the user")
    parser.add_argument('--enhancer', type=str, default=None, help="face enhancer, [gfpgan, RestoreFormer]")
    parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]")
    parser.add_argument("--cpu", dest="cpu", action="store_true")
    parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks")
    parser.add_argument("--still", action="store_true", help="crop back to the original frame for full-body animation")
    parser.add_argument("--preprocess", default='crop', choices=['crop', 'resize', 'full'], help="how to preprocess the images")

    # net structure and parameters
    parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='unused')
    parser.add_argument('--init_path', type=str, default=None, help='unused')
    parser.add_argument('--use_last_fc', default=False, help='zero initialize the last fc')
    parser.add_argument('--bfm_folder', type=str, default='./checkpoints/BFM_Fitting/')
    parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model')

    # default renderer parameters
    parser.add_argument('--focal', type=float, default=1015.)
    parser.add_argument('--center', type=float, default=112.)
    parser.add_argument('--camera_d', type=float, default=10.)
    parser.add_argument('--z_near', type=float, default=5.)
    parser.add_argument('--z_far', type=float, default=15.)

    args = parser.parse_args()

    if torch.cuda.is_available() and not args.cpu:
        args.device = "cuda"
    else:
        args.device = "cpu"

    main(args)
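app.py only passes --driven_audio, --source_image, and --result_dir, so everything else runs with the defaults above. For a higher-quality render the script can also be invoked directly with a few more flags; a sketch with placeholder paths (note that the gfpgan enhancer needs the gfpgan package, which is not listed in requirements.txt):

    python inference.py --driven_audio audio.wav --source_image avatar.jpg --result_dir results --preprocess full --still --enhancer gfpgan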
requirements.txt
ADDED
@@ -0,0 +1,12 @@
gradio
torch
torchaudio
transformers
numpy
pillow
soundfile
ffmpeg-python
openai-whisper
TTS
huggingface_hub
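Note that ffmpeg-python only wraps the ffmpeg binary; if the binary is not already present in the Space image, it is usually declared in a separate packages.txt file (one Debian package per line), for example:

    ffmpeg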