Spaces:
Running
Running
Upload 32 files
Browse files- DenseAV/denseav/aggregators.py +1 -1
- DenseAV/denseav/aligners.py +1 -1
- DenseAV/denseav/data/AVDatasets.py +7 -3
- DenseAV/denseav/data/make_tarballs.py +3 -2
- DenseAV/denseav/eval_utils.py +1 -1
- DenseAV/denseav/evaluate.py +2 -2
- DenseAV/denseav/plotting.py +1 -1
- DenseAV/denseav/shared.py +15 -15
- DenseAV/denseav/train.py +5 -5
DenseAV/denseav/aggregators.py
CHANGED
@@ -6,7 +6,7 @@ import torch.nn as nn
|
|
6 |
import torch.nn.functional as F
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
-
from
|
10 |
|
11 |
|
12 |
@torch.jit.script
|
|
|
6 |
import torch.nn.functional as F
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
+
from constants import *
|
10 |
|
11 |
|
12 |
@torch.jit.script
|
DenseAV/denseav/aligners.py
CHANGED
@@ -4,7 +4,7 @@ import torch
|
|
4 |
import torch.nn.functional as F
|
5 |
from torch.nn import ModuleList
|
6 |
|
7 |
-
from
|
8 |
|
9 |
|
10 |
class ChannelNorm(torch.nn.Module):
|
|
|
4 |
import torch.nn.functional as F
|
5 |
from torch.nn import ModuleList
|
6 |
|
7 |
+
from featurizers.DINO import Block
|
8 |
|
9 |
|
10 |
class ChannelNorm(torch.nn.Module):
|
DenseAV/denseav/data/AVDatasets.py
CHANGED
@@ -18,9 +18,13 @@ from PIL import Image
|
|
18 |
from torch.utils.data import Dataset, DataLoader, default_collate, Subset, ConcatDataset
|
19 |
from tqdm import tqdm
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
24 |
|
25 |
|
26 |
def sample_choice(choices, probs):
|
|
|
18 |
from torch.utils.data import Dataset, DataLoader, default_collate, Subset, ConcatDataset
|
19 |
from tqdm import tqdm
|
20 |
|
21 |
+
import sys
|
22 |
+
sys.path.append('../constants')
|
23 |
+
sys.path.append('../shared')
|
24 |
+
|
25 |
+
from constants import AUDIO_MASK, AUDIO_POS_MASK, IMAGE_MASK, IMAGE_INPUT
|
26 |
+
from make_tarballs import untar_all
|
27 |
+
from shared import norm, prep_waveform
|
28 |
|
29 |
|
30 |
def sample_choice(choices, probs):
|
DenseAV/denseav/data/make_tarballs.py
CHANGED
@@ -9,11 +9,12 @@ from torch.utils.data import Dataset, DataLoader
|
|
9 |
from tqdm import tqdm
|
10 |
from pathlib import Path
|
11 |
|
12 |
-
from denseav.shared import batch
|
13 |
-
|
14 |
import tempfile
|
15 |
import shutil
|
16 |
|
|
|
|
|
|
|
17 |
|
18 |
class Tarballer(Dataset):
|
19 |
|
|
|
9 |
from tqdm import tqdm
|
10 |
from pathlib import Path
|
11 |
|
|
|
|
|
12 |
import tempfile
|
13 |
import shutil
|
14 |
|
15 |
+
import sys
|
16 |
+
sys.path.append('../shared')
|
17 |
+
from shared import batch
|
18 |
|
19 |
class Tarballer(Dataset):
|
20 |
|
DenseAV/denseav/eval_utils.py
CHANGED
@@ -9,7 +9,7 @@ from torchmetrics.functional.classification import binary_average_precision
|
|
9 |
from tqdm import tqdm
|
10 |
|
11 |
from constants import *
|
12 |
-
from
|
13 |
|
14 |
|
15 |
def prep_heatmap(sims, masks, h, w):
|
|
|
9 |
from tqdm import tqdm
|
10 |
|
11 |
from constants import *
|
12 |
+
from shared import unnorm, remove_axes
|
13 |
|
14 |
|
15 |
def prep_heatmap(sims, masks, h, w):
|
DenseAV/denseav/evaluate.py
CHANGED
@@ -4,8 +4,8 @@ from omegaconf import DictConfig, OmegaConf
|
|
4 |
from pytorch_lightning import Trainer
|
5 |
from pytorch_lightning import seed_everything
|
6 |
from pytorch_lightning.loggers import TensorBoardLogger
|
7 |
-
from
|
8 |
-
from
|
9 |
|
10 |
|
11 |
@hydra.main(config_path="configs", config_name="av_align.yaml")
|
|
|
4 |
from pytorch_lightning import Trainer
|
5 |
from pytorch_lightning import seed_everything
|
6 |
from pytorch_lightning.loggers import TensorBoardLogger
|
7 |
+
from data.AVDatasets import AVDataModule
|
8 |
+
from shared import load_trained_model
|
9 |
|
10 |
|
11 |
@hydra.main(config_path="configs", config_name="av_align.yaml")
|
DenseAV/denseav/plotting.py
CHANGED
@@ -10,7 +10,7 @@ import torch.nn.functional as F
|
|
10 |
import torchvision
|
11 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
12 |
from base64 import b64encode
|
13 |
-
from
|
14 |
|
15 |
|
16 |
def write_video_with_audio(video_frames, audio_array, video_fps, audio_fps, output_path):
|
|
|
10 |
import torchvision
|
11 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
12 |
from base64 import b64encode
|
13 |
+
from shared import pca
|
14 |
|
15 |
|
16 |
def write_video_with_audio(video_frames, audio_array, video_fps, audio_fps, output_path):
|
DenseAV/denseav/shared.py
CHANGED
@@ -90,37 +90,37 @@ def get_image_featurizer(name, token_type="key", **kwargs):
|
|
90 |
name = name.lower()
|
91 |
|
92 |
if name == "vit":
|
93 |
-
from
|
94 |
patch_size = 16
|
95 |
model = DINOFeaturizer("vit_small_patch16_224", patch_size, token_type)
|
96 |
dim = 384
|
97 |
elif name == "dino16":
|
98 |
-
from
|
99 |
patch_size = 16
|
100 |
model = DINOFeaturizer("dino_vits16", patch_size, token_type)
|
101 |
dim = 384
|
102 |
elif name == "dino8":
|
103 |
-
from
|
104 |
patch_size = 8
|
105 |
model = DINOFeaturizer("dino_vits8", patch_size, token_type)
|
106 |
dim = 384
|
107 |
elif name == "clip":
|
108 |
-
from
|
109 |
patch_size = 16
|
110 |
model = CLIPFeaturizer()
|
111 |
dim = 512
|
112 |
elif name == "cavmae":
|
113 |
-
from
|
114 |
model = CAVMAEImageFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
115 |
dim = 768
|
116 |
patch_size = 16
|
117 |
elif name == "fnac":
|
118 |
-
from
|
119 |
model = FNACImageFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
120 |
dim = 512
|
121 |
patch_size = 16
|
122 |
elif name == "imagebind":
|
123 |
-
from
|
124 |
model = ImageBindImageFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
125 |
dim = 1024
|
126 |
patch_size = 16
|
@@ -131,12 +131,12 @@ def get_image_featurizer(name, token_type="key", **kwargs):
|
|
131 |
patch_size = 1
|
132 |
dim = 2048
|
133 |
elif name == "davenet":
|
134 |
-
from
|
135 |
model = DavenetImageFeaturizer()
|
136 |
patch_size = 1
|
137 |
dim = 1024
|
138 |
elif name == "dinov2":
|
139 |
-
from
|
140 |
model = DINOv2Featurizer()
|
141 |
patch_size = 14
|
142 |
dim = 768
|
@@ -147,29 +147,29 @@ def get_image_featurizer(name, token_type="key", **kwargs):
|
|
147 |
|
148 |
def get_audio_featurizer(name, **kwargs):
|
149 |
if name == "davenet":
|
150 |
-
from
|
151 |
model = DavenetAudioFeaturizer()
|
152 |
dim = 1024
|
153 |
elif name == "dino8":
|
154 |
model, _, dim = get_image_featurizer("dino8")
|
155 |
elif name == "hubert":
|
156 |
-
from
|
157 |
model = Hubert()
|
158 |
dim = 1024
|
159 |
elif name == "cavmae":
|
160 |
-
from
|
161 |
model = CAVMAEAudioFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
162 |
dim = 768
|
163 |
elif name == "imagebind":
|
164 |
-
from
|
165 |
model = ImageBindAudioFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
166 |
dim = 1024
|
167 |
elif name == "audiomae":
|
168 |
-
from
|
169 |
model = AudioMAE(kwargs["output_root"], False)
|
170 |
dim = 768
|
171 |
elif name == "audiomae-finetuned":
|
172 |
-
from
|
173 |
model = AudioMAE(kwargs["output_root"], True)
|
174 |
dim = 768
|
175 |
else:
|
|
|
90 |
name = name.lower()
|
91 |
|
92 |
if name == "vit":
|
93 |
+
from featurizers.DINO import DINOFeaturizer
|
94 |
patch_size = 16
|
95 |
model = DINOFeaturizer("vit_small_patch16_224", patch_size, token_type)
|
96 |
dim = 384
|
97 |
elif name == "dino16":
|
98 |
+
from featurizers.DINO import DINOFeaturizer
|
99 |
patch_size = 16
|
100 |
model = DINOFeaturizer("dino_vits16", patch_size, token_type)
|
101 |
dim = 384
|
102 |
elif name == "dino8":
|
103 |
+
from featurizers.DINO import DINOFeaturizer
|
104 |
patch_size = 8
|
105 |
model = DINOFeaturizer("dino_vits8", patch_size, token_type)
|
106 |
dim = 384
|
107 |
elif name == "clip":
|
108 |
+
from featurizers.CLIP import CLIPFeaturizer
|
109 |
patch_size = 16
|
110 |
model = CLIPFeaturizer()
|
111 |
dim = 512
|
112 |
elif name == "cavmae":
|
113 |
+
from featurizers.CAVMAE import CAVMAEImageFeaturizer
|
114 |
model = CAVMAEImageFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
115 |
dim = 768
|
116 |
patch_size = 16
|
117 |
elif name == "fnac":
|
118 |
+
from featurizers.FNACAVL import FNACImageFeaturizer
|
119 |
model = FNACImageFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
120 |
dim = 512
|
121 |
patch_size = 16
|
122 |
elif name == "imagebind":
|
123 |
+
from featurizers.ImageBind import ImageBindImageFeaturizer
|
124 |
model = ImageBindImageFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
125 |
dim = 1024
|
126 |
patch_size = 16
|
|
|
131 |
patch_size = 1
|
132 |
dim = 2048
|
133 |
elif name == "davenet":
|
134 |
+
from featurizers.DAVENet import DavenetImageFeaturizer
|
135 |
model = DavenetImageFeaturizer()
|
136 |
patch_size = 1
|
137 |
dim = 1024
|
138 |
elif name == "dinov2":
|
139 |
+
from featurizers.DINOv2 import DINOv2Featurizer
|
140 |
model = DINOv2Featurizer()
|
141 |
patch_size = 14
|
142 |
dim = 768
|
|
|
147 |
|
148 |
def get_audio_featurizer(name, **kwargs):
|
149 |
if name == "davenet":
|
150 |
+
from featurizers.DAVENet import DavenetAudioFeaturizer
|
151 |
model = DavenetAudioFeaturizer()
|
152 |
dim = 1024
|
153 |
elif name == "dino8":
|
154 |
model, _, dim = get_image_featurizer("dino8")
|
155 |
elif name == "hubert":
|
156 |
+
from featurizers.Hubert import Hubert
|
157 |
model = Hubert()
|
158 |
dim = 1024
|
159 |
elif name == "cavmae":
|
160 |
+
from featurizers.CAVMAE import CAVMAEAudioFeaturizer
|
161 |
model = CAVMAEAudioFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
162 |
dim = 768
|
163 |
elif name == "imagebind":
|
164 |
+
from featurizers.ImageBind import ImageBindAudioFeaturizer
|
165 |
model = ImageBindAudioFeaturizer(kwargs["output_root"], model=kwargs.get("model"))
|
166 |
dim = 1024
|
167 |
elif name == "audiomae":
|
168 |
+
from featurizers.AudioMAE import AudioMAE
|
169 |
model = AudioMAE(kwargs["output_root"], False)
|
170 |
dim = 768
|
171 |
elif name == "audiomae-finetuned":
|
172 |
+
from featurizers.AudioMAE import AudioMAE
|
173 |
model = AudioMAE(kwargs["output_root"], True)
|
174 |
dim = 768
|
175 |
else:
|
DenseAV/denseav/train.py
CHANGED
@@ -21,11 +21,11 @@ from torchmetrics.functional.classification import binary_average_precision
|
|
21 |
|
22 |
from huggingface_hub import PyTorchModelHubMixin
|
23 |
|
24 |
-
from
|
25 |
-
from
|
26 |
-
from
|
27 |
-
from
|
28 |
-
from
|
29 |
get_image_featurizer, get_audio_featurizer, RollingAvg, create_model_from_cfg
|
30 |
|
31 |
torch.multiprocessing.set_sharing_strategy('file_system')
|
|
|
21 |
|
22 |
from huggingface_hub import PyTorchModelHubMixin
|
23 |
|
24 |
+
from aggregators import get_aggregator
|
25 |
+
from aligners import get_aligner, ProgressiveGrowing
|
26 |
+
from constants import *
|
27 |
+
from data.AVDatasets import AVDataModule
|
28 |
+
from shared import flatten_preds, GatherLayer, \
|
29 |
get_image_featurizer, get_audio_featurizer, RollingAvg, create_model_from_cfg
|
30 |
|
31 |
torch.multiprocessing.set_sharing_strategy('file_system')
|