Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -32,10 +32,10 @@ footer {
|
|
32 |
'''
|
33 |
|
34 |
input_prefixes = {
|
35 |
-
"Image": "(A image file called β has been attached) ",
|
36 |
-
"GIF": "(A GIF file called β has been attached) ",
|
37 |
-
"Video": "(A video with audio file called β has been attached) ",
|
38 |
-
"Audio": "(A audio file called β has been attached) ",
|
39 |
}
|
40 |
|
41 |
filetypes = {
|
@@ -49,6 +49,18 @@ def uniform_sample(idxs, n):
|
|
49 |
gap = len(idxs) / n
|
50 |
return [idxs[int(i * gap + gap / 2)] for i in range(n)]
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
def encode_video(path):
|
53 |
vr = VideoReader(path, ctx=cpu(0))
|
54 |
fps = round(vr.get_avg_fps())
|
@@ -88,21 +100,11 @@ def generate(input, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7,
|
|
88 |
frames = encode_gif(input)
|
89 |
content.extend(frames)
|
90 |
elif filetype == "Video":
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
audio_np, sample_rate = librosa.load(input, sr=16000, mono=True)
|
97 |
-
for idx, frame_np in zip(idxs, frames_np):
|
98 |
-
image = Image.fromarray(frame_np.astype("uint8")).convert("RGB")
|
99 |
-
content.append(image)
|
100 |
-
sec = idx // fps
|
101 |
-
start = sec * sample_rate
|
102 |
-
end = start + sample_rate
|
103 |
-
chunk_np = audio_np[start:end]
|
104 |
-
chunk_tensor = torch.from_numpy(chunk_np).float().to(DEVICE)
|
105 |
-
content.append({"array": chunk_tensor, "sampling_rate": sample_rate})
|
106 |
elif filetype == "Audio":
|
107 |
audio_np, sample_rate = librosa.load(input, sr=16000, mono=True)
|
108 |
chunk_tensor = torch.from_numpy(audio_np).float().to(DEVICE)
|
|
|
32 |
'''
|
33 |
|
34 |
input_prefixes = {
|
35 |
+
"Image": "(A image file called β has been attached, describe the image content) ",
|
36 |
+
"GIF": "(A GIF file called β has been attached, describe the GIF content) ",
|
37 |
+
"Video": "(A video with audio file called β has been attached, describe the video content and the audio content embedded into the video) ",
|
38 |
+
"Audio": "(A audio file called β has been attached, describe the audio content) ",
|
39 |
}
|
40 |
|
41 |
filetypes = {
|
|
|
49 |
gap = len(idxs) / n
|
50 |
return [idxs[int(i * gap + gap / 2)] for i in range(n)]
|
51 |
|
52 |
+
def build_omni_chunks(path, sr=16000, seconds_per_unit=1):
    """Decompose a video file into interleaved frame/audio units for omni chat.

    For every `seconds_per_unit` of video, emit the triple
    ``"<unit>"``, a PIL RGB frame, and the matching mono audio slice,
    flattened into a single list.

    Args:
        path: Path to the video file (must carry an audio track librosa can read).
        sr: Target audio sampling rate in Hz; audio is resampled to this.
        seconds_per_unit: Length of each unit in seconds.

    Returns:
        list: ``["<unit>", PIL.Image, numpy audio chunk, ...]`` — one triple
        per unit, in chronological order.
    """
    reader = VideoReader(path, ctx=cpu(0))
    frames_per_sec = round(reader.get_avg_fps())
    # decord yields only frames; pull the audio track separately, mono @ sr Hz.
    waveform, _ = librosa.load(path, sr=sr, mono=True)
    # Ceil so a trailing partial unit still gets a (shorter) audio slice.
    unit_count = math.ceil(len(reader) / frames_per_sec / seconds_per_unit)
    samples_per_unit = sr * seconds_per_unit
    chunks = []
    for unit in range(unit_count):
        # One representative frame at the start of the unit.
        frame_img = Image.fromarray(
            reader[int(unit * frames_per_sec * seconds_per_unit)].asnumpy().astype("uint8")
        )
        audio_slice = waveform[unit * samples_per_unit : (unit + 1) * samples_per_unit]
        chunks += ["<unit>", frame_img, audio_slice]
    return chunks
|
63 |
+
|
64 |
def encode_video(path):
|
65 |
vr = VideoReader(path, ctx=cpu(0))
|
66 |
fps = round(vr.get_avg_fps())
|
|
|
100 |
frames = encode_gif(input)
|
101 |
content.extend(frames)
|
102 |
elif filetype == "Video":
|
103 |
+
omni_content = build_omni_chunks(input) + [instruction]
|
104 |
+
sys_msg = repo.get_sys_prompt(mode="omni", language="en")
|
105 |
+
msgs = [sys_msg, {"role": "user", "content": omni_content}]
|
106 |
+
params = dict(msgs=msgs, tokenizer=tokenizer, omni_input=True, **kw)
|
107 |
+
return repo.chat(**params)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
elif filetype == "Audio":
|
109 |
audio_np, sample_rate = librosa.load(input, sr=16000, mono=True)
|
110 |
chunk_tensor = torch.from_numpy(audio_np).float().to(DEVICE)
|