Flux9665 committed on
Commit
855aa9e · verified · 1 Parent(s): dfb452f

Update app.py

Files changed (1)
  1. app.py +12 -9
app.py CHANGED
@@ -80,7 +80,7 @@ from accelerate.utils import set_seed
 from latentsync.whisper.audio2feature import Audio2Feature
 
 
-@spaces.GPU(duration=800)
+@spaces.GPU(duration=40)
 def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     """
     Perform lip-sync video generation using an input video and a separate audio track.
@@ -106,20 +106,22 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
     print(f"Input audio path: {audio_path}")
     print(f"Loaded checkpoint path: {inference_ckpt_path}")
 
-    is_shared_ui = True if "fffiloni/LatentSync" in os.environ['SPACE_ID'] else False
+    is_shared_ui = True
     temp_dir = None
     if is_shared_ui:
         temp_dir = tempfile.mkdtemp()
+        print(1)
         cropped_video_path = process_video(video_path)
         print(f"Cropped video saved to: {cropped_video_path}")
         video_path=cropped_video_path
-
+
         trimmed_audio_path = process_audio(audio_path, temp_dir)
         print(f"Processed file was stored temporarily at: {trimmed_audio_path}")
         audio_path=trimmed_audio_path
 
+    print(2)
     scheduler = DDIMScheduler.from_pretrained("configs")
-
+    print(3)
     if config.model.cross_attention_dim == 768:
         whisper_model_path = "checkpoints/whisper/small.pt"
     elif config.model.cross_attention_dim == 384:
@@ -128,8 +130,9 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         raise NotImplementedError("cross_attention_dim must be 768 or 384")
 
     audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
-
+    print(4)
     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
+    print(5)
     vae.config.scaling_factor = 0.18215
     vae.config.shift_factor = 0
 
@@ -138,7 +141,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         inference_ckpt_path, # load checkpoint
         device="cpu",
     )
-
+    print(6)
     unet = unet.to(dtype=torch.float16)
 
     """
@@ -154,7 +157,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         unet=unet,
         scheduler=scheduler,
     ).to("cuda")
-
+    print(7)
     seed = -1
     if seed != -1:
         set_seed(seed)
@@ -165,7 +168,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
 
     unique_id = str(uuid.uuid4())
     video_out_path = f"video_out{unique_id}.mp4"
-
+
     pipeline(
         video_path=video_path,
         audio_path=audio_path,
@@ -178,7 +181,7 @@ def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
         width=config.data.resolution,
         height=config.data.resolution,
     )
-
+    print(8)
     if is_shared_ui:
         # Clean up the temporary directory
         if os.path.exists(temp_dir):
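
Note on the decorator change: the ZeroGPU budget requested per call drops from 800 s to 40 s, so every invocation of main now has to finish inside the shorter window. A minimal sketch of how the spaces.GPU decorator is typically applied, assuming a ZeroGPU Space with the spaces package installed (the toy function is hypothetical, not from app.py):

import spaces
import torch

@spaces.GPU(duration=40)  # cap the GPU allocation for this call at roughly 40 seconds
def smoke_test():
    # Hypothetical body: CUDA work inside the decorated call runs on the allocated GPU.
    return torch.zeros(1, device="cuda").item()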
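
Note on the is_shared_ui change: the removed line indexed os.environ['SPACE_ID'] directly, which raises KeyError whenever SPACE_ID is unset (for example when running outside a Hugging Face Space); the commit sidesteps that by hardcoding True. A KeyError-safe sketch of the original check, not part of this commit:

import os

# os.environ.get returns "" when SPACE_ID is unset, so the membership test
# evaluates to False instead of raising KeyError outside of Spaces.
is_shared_ui = "fffiloni/LatentSync" in os.environ.get("SPACE_ID", "")

The removed True if ... else False wrapper was also redundant; the bare membership test already yields a bool.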
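
Note on the added print(1) through print(8) calls: they act as numbered progress checkpoints between the preprocessing, model-loading, and inference steps. A sketch of the same tracing through the standard logging module, with timestamps and labels (the stage label shown is an illustrative guess, not from the commit):

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
log = logging.getLogger("latentsync-app")

def checkpoint(n, label):
    # Same role as the commit's bare print(n), but timestamped and labeled.
    log.info("checkpoint %d: %s", n, label)

checkpoint(2, "loading DDIM scheduler")  # illustrative stage label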