Staticaliza committed on
Commit
c750982
·
verified ·
1 Parent(s): 03f6f58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -7,6 +7,7 @@ import math
7
  import librosa
8
  from PIL import Image, ImageSequence
9
  from decord import VideoReader, cpu
 
10
  from transformers import AutoModel, AutoTokenizer, AutoProcessor
11
 
12
  # Variables
@@ -51,13 +52,15 @@ def uniform_sample(idxs, n):
51
  return [idxs[int(i * gap + gap / 2)] for i in range(n)]
52
 
53
  def build_omni_chunks(path, sr=16000, seconds_per_unit=1):
54
- vr = VideoReader(path, ctx=cpu(0))
55
- fps = round(vr.get_avg_fps())
56
- audio_np, _ = librosa.load(path, sr=sr, mono=True)
57
- total_units = math.ceil(len(vr) / fps / seconds_per_unit)
 
58
  content = []
59
  for i in range(total_units):
60
- frame = Image.fromarray(vr[int(i * fps * seconds_per_unit)].asnumpy().astype("uint8"))
 
61
  audio_chunk = audio_np[sr * i * seconds_per_unit : sr * (i + 1) * seconds_per_unit]
62
  content.extend(["<unit>", frame, audio_chunk])
63
  return content
 
7
  import librosa
8
  from PIL import Image, ImageSequence
9
  from decord import VideoReader, cpu
10
+ from moviepy.editor import VideoFileClip
11
  from transformers import AutoModel, AutoTokenizer, AutoProcessor
12
 
13
  # Variables
 
52
  return [idxs[int(i * gap + gap / 2)] for i in range(n)]
53
 
54
  def build_omni_chunks(path, sr=16000, seconds_per_unit=1):
55
+ clip = VideoFileClip(path)
56
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
57
+ clip.audio.write_audiofile(tmp.name, fps=sr, codec="pcm_s16le", verbose=False, logger=None)
58
+ audio_np, _ = librosa.load(tmp.name, sr=sr, mono=True)
59
+ total_units = math.ceil(clip.duration / seconds_per_unit)
60
  content = []
61
  for i in range(total_units):
62
+ t = min(i * seconds_per_unit, clip.duration - 1e-3)
63
+ frame = Image.fromarray(clip.get_frame(t).astype("uint8"))
64
  audio_chunk = audio_np[sr * i * seconds_per_unit : sr * (i + 1) * seconds_per_unit]
65
  content.extend(["<unit>", frame, audio_chunk])
66
  return content