jjjimim committed on
Commit b5f2882 · verified · 1 Parent(s): f65860b

Upload 4 files

Files changed (4)
  1. README.md.txt +5 -0
  2. app.py +44 -0
  3. inference.py +158 -0
  4. requirements.txt +12 -0
README.md.txt ADDED
@@ -0,0 +1,5 @@
+ Faceless Video Generator (Free)
+ This tool lets you upload a photo + text and get an AI-generated talking video.
+ It uses SadTalker, Coqui TTS, and Whisper to build everything locally in one step.
+ Built with Gradio + Hugging Face Spaces.
+
app.py ADDED
@@ -0,0 +1,44 @@
+ from huggingface_hub import snapshot_download
+ import gradio as gr
+ import subprocess
+ import glob
+ import os
+ import uuid
+
+ # Download the SadTalker checkpoints from Hugging Face (one-time setup)
+ def setup_models():
+     if not os.path.exists("checkpoints"):
+         snapshot_download(repo_id="OpenTalker/SadTalker", local_dir="checkpoints")
+
+ setup_models()
+
+ # Main function: script text + photo -> TTS audio -> SadTalker talking-head video
+ def generate(text, image):
+     session = str(uuid.uuid4())[:8]
+     session_dir = f"results/{session}"
+     os.makedirs(session_dir, exist_ok=True)
+
+     # Save the uploaded image (PIL image from the Gradio input; convert to RGB for JPEG)
+     image_path = f"{session_dir}/avatar.jpg"
+     image.convert("RGB").save(image_path)
+
+     # Generate audio with the Coqui TTS CLI (falls back to its default model)
+     audio_path = f"{session_dir}/audio.wav"
+     subprocess.run(["tts", "--text", text, "--out_path", audio_path], check=True)
+
+     # Run SadTalker; inference.py writes the video into a timestamped subfolder
+     # of --result_dir, so return the newest .mp4 found under the session folder
+     subprocess.run(["python", "inference.py", "--driven_audio", audio_path,
+                     "--source_image", image_path, "--result_dir", session_dir], check=True)
+     videos = glob.glob(f"{session_dir}/**/*.mp4", recursive=True)
+     return max(videos, key=os.path.getmtime)
+
+ # Gradio interface
+ gr.Interface(
+     fn=generate,
+     inputs=[
+         gr.Textbox(label="📝 Script", placeholder="Enter your script here..."),
+         gr.Image(label="🖼️ Avatar Image", type="pil")
+     ],
+     outputs=gr.Video(label="🎬 Generated Video"),
+     title="🆓 Faceless Video Generator",
+     description="Upload a face photo + script, and get a talking avatar video powered by SadTalker + Coqui TTS."
+ ).launch()
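The `tts` call in app.py leaves the voice up to whatever default model the Coqui CLI resolves to. If the Space should pin a specific voice, the same step can go through Coqui's Python API instead; the sketch below is illustrative only, and the model name is a stock entry from Coqui's catalogue rather than anything this commit specifies.

# Sketch (not part of this commit): synthesize speech with an explicitly pinned
# Coqui TTS model instead of the CLI default. The model name is a standard
# Coqui catalogue entry used purely as an example.
from TTS.api import TTS

def synthesize(text: str, out_path: str) -> str:
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
    tts.tts_to_file(text=text, file_path=out_path)
    return out_path

If this route is preferred, generate() in app.py could call synthesize(text, audio_path) in place of the subprocess call.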
inference.py ADDED
@@ -0,0 +1,158 @@
+ import torch
+ from time import strftime
+ import os, sys, time
+ from argparse import ArgumentParser
+
+ from src.utils.preprocess import CropAndExtract
+ from src.test_audio2coeff import Audio2Coeff
+ from src.facerender.animate import AnimateFromCoeff
+ from src.generate_batch import get_data
+ from src.generate_facerender_batch import get_facerender_data
+
+ def main(args):
+     #torch.backends.cudnn.enabled = False
+
+     pic_path = args.source_image
+     audio_path = args.driven_audio
+     save_dir = os.path.join(args.result_dir, strftime("%Y_%m_%d_%H.%M.%S"))
+     os.makedirs(save_dir, exist_ok=True)
+     pose_style = args.pose_style
+     device = args.device
+     batch_size = args.batch_size
+     input_yaw_list = args.input_yaw
+     input_pitch_list = args.input_pitch
+     input_roll_list = args.input_roll
+     ref_eyeblink = args.ref_eyeblink
+     ref_pose = args.ref_pose
+
+     current_code_path = sys.argv[0]
+     current_root_path = os.path.split(current_code_path)[0]
+
+     os.environ['TORCH_HOME'] = os.path.join(current_root_path, args.checkpoint_dir)
+
+     path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
+     path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
+     dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting')
+     wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')
+
+     # note: the 'auido*' spellings below match the released SadTalker checkpoint filenames
+     audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
+     audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
+
+     audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
+     audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
+
+     free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')
+
+     if args.preprocess == 'full':
+         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
+         facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
+     else:
+         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
+         facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')
+
+     # init models
+     print(path_of_net_recon_model)
+     preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
+
+     print(audio2pose_checkpoint)
+     print(audio2exp_checkpoint)
+     audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
+                                  audio2exp_checkpoint, audio2exp_yaml_path,
+                                  wav2lip_checkpoint, device)
+
+     print(free_view_checkpoint)
+     print(mapping_checkpoint)
+     animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
+                                           facerender_yaml_path, device)
+
+     # crop the image and extract 3DMM coefficients from the source image
+     first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
+     os.makedirs(first_frame_dir, exist_ok=True)
+     print('3DMM Extraction for source image')
+     first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
+     if first_coeff_path is None:
+         print("Can't get the coeffs of the input")
+         return
+
+     if ref_eyeblink is not None:
+         ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
+         ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
+         os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
+         print('3DMM Extraction for the reference video providing eye blinking')
+         ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
+     else:
+         ref_eyeblink_coeff_path = None
+
+     if ref_pose is not None:
+         if ref_pose == ref_eyeblink:
+             ref_pose_coeff_path = ref_eyeblink_coeff_path
+         else:
+             ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
+             ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
+             os.makedirs(ref_pose_frame_dir, exist_ok=True)
+             print('3DMM Extraction for the reference video providing pose')
+             ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
+     else:
+         ref_pose_coeff_path = None
+
+     # audio to coefficients
+     batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
+     coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
+
+     # optional 3D face visualisation
+     if args.face3dvis:
+         from src.face3d.visualize import gen_composed_video
+         gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))
+
+     # coefficients to video
+     data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
+                                batch_size, input_yaw_list, input_pitch_list, input_roll_list,
+                                expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)
+
+     animate_from_coeff.generate(data, save_dir, pic_path, crop_info,
+                                 enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)
+
+ if __name__ == '__main__':
+
+     parser = ArgumentParser()
+     parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', help="path to driven audio")
+     parser.add_argument("--source_image", default='./examples/source_image/full_body_2.png', help="path to source image")
+     parser.add_argument("--ref_eyeblink", default=None, help="path to reference video providing eye blinking")
+     parser.add_argument("--ref_pose", default=None, help="path to reference video providing pose")
+     parser.add_argument("--checkpoint_dir", default='./checkpoints', help="path to the SadTalker checkpoints")
+     parser.add_argument("--result_dir", default='./results', help="path to output")
+     parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)")
+     parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender")
+     parser.add_argument("--expression_scale", type=float, default=1., help="scale factor for expression intensity")
+     parser.add_argument('--input_yaw', nargs='+', type=int, default=None, help="the input yaw degree of the user")
+     parser.add_argument('--input_pitch', nargs='+', type=int, default=None, help="the input pitch degree of the user")
+     parser.add_argument('--input_roll', nargs='+', type=int, default=None, help="the input roll degree of the user")
+     parser.add_argument('--enhancer', type=str, default=None, help="face enhancer, [gfpgan, RestoreFormer]")
+     parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]")
+     parser.add_argument("--cpu", dest="cpu", action="store_true")
+     parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks")
+     parser.add_argument("--still", action="store_true", help="crop back to the original frame for full-body animation")
+     parser.add_argument("--preprocess", default='crop', choices=['crop', 'resize', 'full'], help="how to preprocess the images")
+
+     # net structure and parameters
+     parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='unused')
+     parser.add_argument('--init_path', type=str, default=None, help='unused')
+     parser.add_argument('--use_last_fc', default=False, help='zero initialize the last fc')
+     parser.add_argument('--bfm_folder', type=str, default='./checkpoints/BFM_Fitting/')
+     parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model')
+
+     # default renderer parameters
+     parser.add_argument('--focal', type=float, default=1015.)
+     parser.add_argument('--center', type=float, default=112.)
+     parser.add_argument('--camera_d', type=float, default=10.)
+     parser.add_argument('--z_near', type=float, default=5.)
+     parser.add_argument('--z_far', type=float, default=15.)
+
+     args = parser.parse_args()
+
+     if torch.cuda.is_available() and not args.cpu:
+         args.device = "cuda"
+     else:
+         args.device = "cpu"
+
+     main(args)
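For testing outside the Gradio app, inference.py can also be driven directly with the flags it defines above. A minimal sketch, assuming the checkpoints already sit in ./checkpoints; the audio and image paths are placeholders, not files included in this commit.

# Sketch (not part of this commit): standalone run of inference.py using the
# command-line flags it defines. Paths are placeholders for illustration.
import subprocess

subprocess.run(
    [
        "python", "inference.py",
        "--driven_audio", "results/demo/audio.wav",   # placeholder audio path
        "--source_image", "results/demo/avatar.jpg",  # placeholder image path
        "--result_dir", "results/demo",
        "--preprocess", "full",   # selects the still-mode facerender config branch
        "--still",
        "--enhancer", "gfpgan",   # optional face enhancer listed in the --enhancer help
    ],
    check=True,
)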
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ gradio
+ torch
+ torchaudio
+ transformers
+ numpy
+ pillow
+ soundfile
+ ffmpeg-python
+ openai-whisper
+ TTS
+ huggingface_hub
+