Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,7 @@ import math
|
|
| 7 |
import librosa
|
| 8 |
from PIL import Image, ImageSequence
|
| 9 |
from decord import VideoReader, cpu
|
|
|
|
| 10 |
from transformers import AutoModel, AutoTokenizer, AutoProcessor
|
| 11 |
|
| 12 |
# Variables
|
|
@@ -51,13 +52,15 @@ def uniform_sample(idxs, n):
|
|
| 51 |
return [idxs[int(i * gap + gap / 2)] for i in range(n)]
|
| 52 |
|
| 53 |
def build_omni_chunks(path, sr=16000, seconds_per_unit=1):
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
content = []
|
| 59 |
for i in range(total_units):
|
| 60 |
-
|
|
|
|
| 61 |
audio_chunk = audio_np[sr * i * seconds_per_unit : sr * (i + 1) * seconds_per_unit]
|
| 62 |
content.extend(["<unit>", frame, audio_chunk])
|
| 63 |
return content
|
|
|
|
| 7 |
import librosa
|
| 8 |
from PIL import Image, ImageSequence
|
| 9 |
from decord import VideoReader, cpu
|
| 10 |
+
from moviepy.editor import VideoFileClip
|
| 11 |
from transformers import AutoModel, AutoTokenizer, AutoProcessor
|
| 12 |
|
| 13 |
# Variables
|
|
|
|
| 52 |
return [idxs[int(i * gap + gap / 2)] for i in range(n)]
|
| 53 |
|
| 54 |
def build_omni_chunks(path, sr=16000, seconds_per_unit=1):
    """Split a video into interleaved ``"<unit>"`` / frame / audio-chunk items.

    The audio track is decoded to a mono waveform at ``sr`` Hz via a
    temporary WAV file, and the clip is cut into ``ceil(duration /
    seconds_per_unit)`` units. For each unit, one frame is grabbed at the
    unit's start time and paired with that unit's slice of the waveform.

    Args:
        path: Path to the video file, passed to ``VideoFileClip``.
        sr: Target audio sample rate in Hz.
        seconds_per_unit: Length of each unit in seconds (may be fractional).

    Returns:
        A flat list ``["<unit>", frame, audio_chunk, "<unit>", ...]`` where
        ``frame`` is a ``PIL.Image`` and ``audio_chunk`` is a slice of the
        array returned by ``librosa.load``. The final chunk may be shorter
        than a full unit.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Imported locally so the function does not depend on top-of-file imports
    # that are not visible alongside this definition.
    import contextlib
    import os
    import tempfile

    clip = VideoFileClip(path)
    try:
        if clip.audio is None:
            raise ValueError(f"video has no audio track: {path!r}")

        # Reserve a temp path and close our handle immediately so ffmpeg can
        # write to it on every platform; the file is removed once decoded.
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            clip.audio.write_audiofile(
                wav_path, fps=sr, codec="pcm_s16le", verbose=False, logger=None
            )
            audio_np, _ = librosa.load(wav_path, sr=sr, mono=True)
        finally:
            # Best-effort cleanup: a failed delete must not mask the real
            # outcome (success or the original exception).
            with contextlib.suppress(OSError):
                os.remove(wav_path)

        duration = clip.duration
        total_units = math.ceil(duration / seconds_per_unit)
        # Integer sample count per unit so slicing also works when
        # seconds_per_unit is fractional.
        samples_per_unit = int(round(sr * seconds_per_unit))

        content = []
        for i in range(total_units):
            # Stay strictly inside the clip (reading exactly at `duration`
            # can fail) and never go below zero for very short clips.
            t = max(0.0, min(i * seconds_per_unit, duration - 1e-3))
            frame = Image.fromarray(clip.get_frame(t).astype("uint8"))
            audio_chunk = audio_np[samples_per_unit * i : samples_per_unit * (i + 1)]
            content.extend(["<unit>", frame, audio_chunk])
        return content
    finally:
        # Release the ffmpeg reader processes / file handles held by the clip.
        clip.close()
|