salimshakeel committed
Commit d2542a3
Parent(s): 30508a4
upload files
- .gitattributes +173 -35
- Dockerfile +15 -0
- Procfile +1 -0
- __init__.py +0 -0
- config.py +7 -0
- layers/attention.py +132 -0
- layers/summarizer.py +139 -0
- main.py +26 -0
- routes/__init__.py +0 -0
- routes/summarize.py +28 -0
- services/__init__.py +0 -0
- services/extractor.py +62 -0
- services/model_loader.py +19 -0
- services/summarizer.py +65 -0
- static/uploads/77ea55af6d744160a5c7e8440b294bb6_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4 +3 -0
- static/uploads/84daab3df51f418ebff312b2ed129bc1_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4 +3 -0
- static/uploads/8ba4aec007f5404db2e9ac9570e59ca6_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4 +3 -0
- static/uploads/b0b93f4bcdcb4662865bb4dc26c1b243_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4 +3 -0
- static/uploads/e051610a8a634fd9a9de3c016d38ce73_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4 +3 -0
- utils/__init__.py +0 -0
- utils/file_utils.py +10 -0
.gitattributes
CHANGED
@@ -1,35 +1,173 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml
static/uploads/* filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,15 @@
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# You will also find guides on how best to write your Dockerfile

FROM python:3.12-slim

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r /code/requirements.txt

COPY . .

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
Procfile
ADDED
@@ -0,0 +1 @@
web: gunicorn -w 3 -k uvicorn.workers.UvicornWorker main:app
__init__.py
ADDED
File without changes
config.py
ADDED
@@ -0,0 +1,7 @@
# config.py
import torch
UPLOAD_DIR = "backend/static/uploads"
OUTPUT_DIR = "backend/static/outputs"
FRAME_RATE = 15
SCORE_THRESHOLD = 0.4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
layers/attention.py
ADDED
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import numpy as np


class SelfAttention(nn.Module):
    def __init__(self, input_size=1024, output_size=1024, freq=10000, heads=1, pos_enc=None):
        """ The basic (multi-head) Attention 'cell' containing the learnable parameters of Q, K and V

        :param int input_size: Feature input size of Q, K, V.
        :param int output_size: Feature -hidden- size of Q, K, V.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param int heads: Number of heads for the attention module.
        :param str | None pos_enc: The type of the positional encoding [supported: Absolute, Relative].
        """
        super(SelfAttention, self).__init__()

        self.permitted_encodings = ["absolute", "relative"]
        if pos_enc is not None:
            pos_enc = pos_enc.lower()
            assert pos_enc in self.permitted_encodings, f"Supported encodings: {*self.permitted_encodings,}"

        self.input_size = input_size
        self.output_size = output_size
        self.heads = heads
        self.pos_enc = pos_enc
        self.freq = freq
        self.Wk, self.Wq, self.Wv = nn.ModuleList(), nn.ModuleList(), nn.ModuleList()
        for _ in range(self.heads):
            self.Wk.append(nn.Linear(in_features=input_size, out_features=output_size//heads, bias=False))
            self.Wq.append(nn.Linear(in_features=input_size, out_features=output_size//heads, bias=False))
            self.Wv.append(nn.Linear(in_features=input_size, out_features=output_size//heads, bias=False))
        self.out = nn.Linear(in_features=output_size, out_features=input_size, bias=False)

        self.softmax = nn.Softmax(dim=-1)
        self.drop = nn.Dropout(p=0.5)

    def getAbsolutePosition(self, T):
        """Calculate the sinusoidal positional encoding based on the absolute position of each considered frame.
        Based on 'Attention is all you need' paper (https://arxiv.org/abs/1706.03762)

        :param int T: Number of frames contained in Q, K and V
        :return: Tensor with shape [T, T]
        """
        freq = self.freq
        d = self.input_size

        pos = torch.tensor([k for k in range(T)], device=self.out.weight.device)
        i = torch.tensor([k for k in range(T//2)], device=self.out.weight.device)

        # Reshape tensors each pos_k for each i indices
        pos = pos.reshape(pos.shape[0], 1)
        pos = pos.repeat_interleave(i.shape[0], dim=1)
        i = i.repeat(pos.shape[0], 1)

        AP = torch.zeros(T, T, device=self.out.weight.device)
        AP[pos, 2*i] = torch.sin(pos / freq ** ((2 * i) / d))
        AP[pos, 2*i+1] = torch.cos(pos / freq ** ((2 * i) / d))
        return AP

    def getRelativePosition(self, T):
        """Calculate the sinusoidal positional encoding based on the relative position of each considered frame.
        r_pos calculations as here: https://theaisummer.com/positional-embeddings/

        :param int T: Number of frames contained in Q, K and V
        :return: Tensor with shape [T, T]
        """
        freq = self.freq
        d = 2 * T
        min_rpos = -(T - 1)

        i = torch.tensor([k for k in range(T)], device=self.out.weight.device)
        j = torch.tensor([k for k in range(T)], device=self.out.weight.device)

        # Reshape tensors each i for each j indices
        i = i.reshape(i.shape[0], 1)
        i = i.repeat_interleave(i.shape[0], dim=1)
        j = j.repeat(i.shape[0], 1)

        # Calculate the relative positions
        r_pos = j - i - min_rpos

        RP = torch.zeros(T, T, device=self.out.weight.device)
        idx = torch.tensor([k for k in range(T//2)], device=self.out.weight.device)
        RP[:, 2*idx] = torch.sin(r_pos[:, 2*idx] / freq ** ((i[:, 2*idx] + j[:, 2*idx]) / d))
        RP[:, 2*idx+1] = torch.cos(r_pos[:, 2*idx+1] / freq ** ((i[:, 2*idx+1] + j[:, 2*idx+1]) / d))
        return RP

    def forward(self, x):
        """ Compute the weighted frame features, based on either the global or local (multi-head) attention mechanism.

        :param torch.tensor x: Frame features with shape [T, input_size]
        :return: A tuple of:
            y: Weighted features based on the attention weights, with shape [T, input_size]
            att_weights : The attention weights (before dropout), with shape [T, T]
        """
        outputs = []
        for head in range(self.heads):
            K = self.Wk[head](x)
            Q = self.Wq[head](x)
            V = self.Wv[head](x)

            # Q *= 0.06  # scale factor VASNet
            # Q /= np.sqrt(self.output_size)  # scale factor (i.e 1 / sqrt(d_k) )
            energies = torch.matmul(Q, K.transpose(1, 0))
            if self.pos_enc is not None:
                if self.pos_enc == "absolute":
                    AP = self.getAbsolutePosition(T=energies.shape[0])
                    energies = energies + AP
                elif self.pos_enc == "relative":
                    RP = self.getRelativePosition(T=energies.shape[0])
                    energies = energies + RP

            att_weights = self.softmax(energies)
            _att_weights = self.drop(att_weights)
            y = torch.matmul(_att_weights, V)

            # Save the current head output
            outputs.append(y)
        y = self.out(torch.cat(outputs, dim=1))
        return y, att_weights.clone()  # for now we don't deal with the weights (probably max or avg pooling)


if __name__ == '__main__':
    pass
    """Uncomment for a quick proof of concept
    model = SelfAttention(input_size=256, output_size=256, pos_enc="absolute").cuda()
    _input = torch.randn(500, 256).cuda()  # [seq_len, hidden_size]
    output, weights = model(_input)
    print(f"Output shape: {output.shape}\tattention shape: {weights.shape}")
    """
layers/summarizer.py
ADDED
@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from .attention import SelfAttention


class MultiAttention(nn.Module):
    def __init__(self, input_size=1024, output_size=1024, freq=10000, pos_enc=None,
                 num_segments=None, heads=1, fusion=None):
        """ Class wrapping the MultiAttention part of PGL-SUM; its key modules and parameters.

        :param int input_size: The expected input feature size.
        :param int output_size: The hidden feature size of the attention mechanisms.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param None | str pos_enc: The selected positional encoding [absolute, relative].
        :param None | int num_segments: The selected number of segments to split the videos.
        :param int heads: The selected number of global heads.
        :param None | str fusion: The selected type of feature fusion.
        """
        super(MultiAttention, self).__init__()

        # Global Attention, considering differences among all frames
        self.attention = SelfAttention(input_size=input_size, output_size=output_size,
                                       freq=freq, pos_enc=pos_enc, heads=heads)

        self.num_segments = num_segments
        if self.num_segments is not None:
            assert self.num_segments >= 2, "num_segments must be None or 2+"
            self.local_attention = nn.ModuleList()
            for _ in range(self.num_segments):
                # Local Attention, considering differences among the same segment with reduced hidden size
                self.local_attention.append(SelfAttention(input_size=input_size, output_size=output_size//num_segments,
                                                          freq=freq, pos_enc=pos_enc, heads=4))
        self.permitted_fusions = ["add", "mult", "avg", "max"]
        self.fusion = fusion
        if self.fusion is not None:
            self.fusion = self.fusion.lower()
            assert self.fusion in self.permitted_fusions, f"Fusion method must be: {*self.permitted_fusions,}"

    def forward(self, x):
        """ Compute the weighted frame features, based on the global and locals (multi-head) attention mechanisms.

        :param torch.Tensor x: Tensor with shape [T, input_size] containing the frame features.
        :return: A tuple of:
            weighted_value: Tensor with shape [T, input_size] containing the weighted frame features.
            attn_weights: Tensor with shape [T, T] containing the attention weights.
        """
        weighted_value, attn_weights = self.attention(x)  # global attention

        if self.num_segments is not None and self.fusion is not None:
            segment_size = math.ceil(x.shape[0] / self.num_segments)
            for segment in range(self.num_segments):
                left_pos = segment * segment_size
                right_pos = (segment + 1) * segment_size
                local_x = x[left_pos:right_pos]
                weighted_local_value, attn_local_weights = self.local_attention[segment](local_x)  # local attentions

                # Normalize the features vectors
                weighted_value[left_pos:right_pos] = F.normalize(weighted_value[left_pos:right_pos].clone(), p=2, dim=1)
                weighted_local_value = F.normalize(weighted_local_value, p=2, dim=1)
                if self.fusion == "add":
                    weighted_value[left_pos:right_pos] += weighted_local_value
                elif self.fusion == "mult":
                    weighted_value[left_pos:right_pos] *= weighted_local_value
                elif self.fusion == "avg":
                    weighted_value[left_pos:right_pos] += weighted_local_value
                    weighted_value[left_pos:right_pos] /= 2
                elif self.fusion == "max":
                    weighted_value[left_pos:right_pos] = torch.max(weighted_value[left_pos:right_pos].clone(),
                                                                   weighted_local_value)

        return weighted_value, attn_weights


class PGL_SUM(nn.Module):
    def __init__(self, input_size=1024, output_size=1024, freq=10000, pos_enc=None,
                 num_segments=None, heads=1, fusion=None):
        """ Class wrapping the PGL-SUM model; its key modules and parameters.

        :param int input_size: The expected input feature size.
        :param int output_size: The hidden feature size of the attention mechanisms.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param None | str pos_enc: The selected positional encoding [absolute, relative].
        :param None | int num_segments: The selected number of segments to split the videos.
        :param int heads: The selected number of global heads.
        :param None | str fusion: The selected type of feature fusion.
        """
        super(PGL_SUM, self).__init__()

        self.attention = MultiAttention(input_size=input_size, output_size=output_size, freq=freq,
                                        pos_enc=pos_enc, num_segments=num_segments, heads=heads, fusion=fusion)
        self.linear_1 = nn.Linear(in_features=input_size, out_features=input_size)
        self.linear_2 = nn.Linear(in_features=self.linear_1.out_features, out_features=1)

        self.drop = nn.Dropout(p=0.5)
        self.norm_y = nn.LayerNorm(normalized_shape=input_size, eps=1e-6)
        self.norm_linear = nn.LayerNorm(normalized_shape=self.linear_1.out_features, eps=1e-6)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, frame_features):
        """ Produce frames importance scores from the frame features, using the PGL-SUM model.

        :param torch.Tensor frame_features: Tensor of shape [T, input_size] containing the frame features produced by
        using the pool5 layer of GoogleNet.
        :return: A tuple of:
            y: Tensor with shape [1, T] containing the frames importance scores in [0, 1].
            attn_weights: Tensor with shape [T, T] containing the attention weights.
        """
        residual = frame_features
        weighted_value, attn_weights = self.attention(frame_features)
        y = weighted_value + residual
        y = self.drop(y)
        y = self.norm_y(y)

        # 2-layer NN (Regressor Network)
        y = self.linear_1(y)
        y = self.relu(y)
        y = self.drop(y)
        y = self.norm_linear(y)

        y = self.linear_2(y)
        y = self.sigmoid(y)
        y = y.view(1, -1)

        return y, attn_weights


if __name__ == '__main__':
    pass
    """Uncomment for a quick proof of concept
    model = PGL_SUM(input_size=256, output_size=256, num_segments=3, fusion="Add").cuda()
    _input = torch.randn(500, 256).cuda()  # [seq_len, hidden_size]
    output, weights = model(_input)
    print(f"Output shape: {output.shape}\tattention shape: {weights.shape}")
    """
main.py
ADDED
@@ -0,0 +1,26 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from routes import summarize
from fastapi.staticfiles import StaticFiles
from fastapi.responses import JSONResponse
import os

app = FastAPI()
app.include_router(summarize.router)

# ✅ Root route to avoid 404 on /
@app.get("/")
def read_root():
    return JSONResponse(content={"message": "Video summarization API is running"})

# CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static folder
static_dir = os.path.join("backend", "static")
app.mount("/static", StaticFiles(directory=static_dir), name="static")
routes/__init__.py
ADDED
File without changes
routes/summarize.py
ADDED
@@ -0,0 +1,28 @@
from fastapi import APIRouter, UploadFile, File
from fastapi.responses import JSONResponse
from utils.file_utils import save_uploaded_file
from services.extractor import extract_features
from services.model_loader import load_model
from services.summarizer import get_scores, get_selected_indices, save_summary_video
from config import UPLOAD_DIR, OUTPUT_DIR

router = APIRouter()

@router.post("/summarize")
def summarize_video(video: UploadFile = File(...)):
    if not video.filename.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
        return JSONResponse(content={"error": "Unsupported file format"}, status_code=400)

    video_path = save_uploaded_file(video, UPLOAD_DIR)
    features, picks = extract_features(video_path)
    model = load_model("backend/Model/epoch-199.pkl")
    scores = get_scores(model, features)
    selected = get_selected_indices(scores, picks)
    output_path = f"{OUTPUT_DIR}/summary_{video.filename}"
    save_summary_video(video_path, selected, output_path)
    summary_url = f"/static/outputs/summary_{video.filename}"

    return JSONResponse(content={
        "message": "Summarization complete",
        "summary_video_url": summary_url
    })
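
For reference, a minimal client sketch for the /summarize route above. The base URL, the sample file name, and the use of the requests package are assumptions; the form field name "video" and the response keys come from the route itself, and the port matches the uvicorn command in the Dockerfile.

# Hypothetical client for POST /summarize (assumes a server at localhost:7860 and a local sample.mp4).
import requests

BASE_URL = "http://localhost:7860"  # assumed; matches the port in the Dockerfile CMD

with open("sample.mp4", "rb") as f:  # hypothetical test video
    resp = requests.post(
        f"{BASE_URL}/summarize",
        files={"video": ("sample.mp4", f, "video/mp4")},  # field name matches the route parameter
    )

resp.raise_for_status()
data = resp.json()
print(data["message"])                        # "Summarization complete"
print(BASE_URL + data["summary_video_url"])   # URL of the summary under /static/outputs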
services/__init__.py
ADDED
File without changes
services/extractor.py
ADDED
@@ -0,0 +1,62 @@
import cv2
import torch
import numpy as np
from PIL import Image
from torchvision import models, transforms
from config import DEVICE, FRAME_RATE

# Load GoogLeNet once
from torchvision.models import GoogLeNet_Weights
weights = GoogLeNet_Weights.DEFAULT
googlenet = models.googlenet(weights=weights).to(DEVICE).eval()

feature_extractor = torch.nn.Sequential(
    googlenet.conv1,
    googlenet.maxpool1,
    googlenet.conv2,
    googlenet.conv3,
    googlenet.maxpool2,
    googlenet.inception3a,
    googlenet.inception3b,
    googlenet.maxpool3,
    googlenet.inception4a,
    googlenet.inception4b,
    googlenet.inception4c,
    googlenet.inception4d,
    googlenet.inception4e,
    googlenet.maxpool4,
    googlenet.inception5a,
    googlenet.inception5b,
    googlenet.avgpool,
    torch.nn.Flatten()
)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

def extract_features(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    picks, frames = [], []
    count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if int(count % round(fps // FRAME_RATE)) == 0:
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            input_tensor = transform(image).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                feature = feature_extractor(input_tensor).squeeze(0).cpu().numpy()
            frames.append(feature)
            picks.append(count)
        count += 1
    cap.release()
    return np.stack(frames), picks
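
As a quick sanity check, a hedged sketch of what extract_features returns; the video path is hypothetical, and the 1024-dimensional feature size follows from GoogLeNet's pooled output being used as the model's input_size.

# Sketch: inspect the extractor output (hypothetical video path).
features, picks = extract_features("static/uploads/sample.mp4")
print(features.shape)  # (N, 1024): one pooled GoogLeNet feature per sampled frame
print(picks[:5])       # original frame indices of the sampled frames, e.g. [0, 2, 4, 6, 8] at 30 fps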
services/model_loader.py
ADDED
@@ -0,0 +1,19 @@
import torch
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from layers.summarizer import PGL_SUM
from config import DEVICE

def load_model(weights_path):
    model = PGL_SUM(
        input_size=1024,
        output_size=1024,
        num_segments=4,
        heads=8,
        fusion="add",
        pos_enc="absolute"
    ).to(DEVICE)
    model.load_state_dict(torch.load(weights_path, map_location=DEVICE))
    model.eval()
    return model
services/summarizer.py
ADDED
@@ -0,0 +1,65 @@
import cv2
import torch
from config import SCORE_THRESHOLD

def get_scores(model, features):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    with torch.no_grad():
        features_tensor = torch.tensor(features, dtype=torch.float32).to(device)
        scores, _ = model(features_tensor)
    return scores.squeeze().cpu().numpy()


def get_selected_indices(scores, picks, threshold=SCORE_THRESHOLD):
    return [picks[i] for i, score in enumerate(scores) if score >= threshold]

import subprocess
import os

def save_summary_video(video_path, selected_indices, output_path, fps=15):
    import cv2

    cap = cv2.VideoCapture(video_path)
    selected = set(selected_indices)
    frame_id = 0
    frames = {}

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_id in selected:
            frames[frame_id] = frame
        frame_id += 1
    cap.release()

    if not frames:
        print("No frames selected.")
        return

    h, w, _ = list(frames.values())[0].shape

    # 1️⃣ Save raw video first
    raw_output_path = output_path.replace(".mp4", "_raw.mp4")
    writer = cv2.VideoWriter(raw_output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
    for fid in sorted(frames.keys()):
        writer.write(frames[fid])
    writer.release()

    # 2️⃣ Use FFmpeg to fix video (browser-compatible)
    try:
        subprocess.run([
            "ffmpeg",
            "-y",  # overwrite if file exists
            "-i", raw_output_path,
            "-vcodec", "libx264",
            "-acodec", "aac",
            output_path
        ], check=True)
        os.remove(raw_output_path)  # optional: remove raw file
        print(f"✅ FFmpeg re-encoded video saved to: {output_path}")
    except subprocess.CalledProcessError as e:
        print("❌ FFmpeg failed:", e)
        print("⚠️ Using raw video instead.")
        os.rename(raw_output_path, output_path)
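
Taken together, these services can also be driven outside FastAPI. A minimal offline sketch, assuming the same weights path used by the route and a hypothetical local input video:

# Offline pipeline sketch (assumes backend/Model/epoch-199.pkl and a local input video exist).
from services.extractor import extract_features
from services.model_loader import load_model
from services.summarizer import get_scores, get_selected_indices, save_summary_video

video_path = "static/uploads/sample.mp4"           # hypothetical input
features, picks = extract_features(video_path)     # sampled GoogLeNet features + original frame indices
model = load_model("backend/Model/epoch-199.pkl")  # PGL-SUM weights, path as in routes/summarize.py
scores = get_scores(model, features)               # per-sampled-frame importance scores in [0, 1]
selected = get_selected_indices(scores, picks)     # frame indices scoring above SCORE_THRESHOLD
save_summary_video(video_path, selected, "summary_sample.mp4")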
static/uploads/77ea55af6d744160a5c7e8440b294bb6_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99183f4ca670013008f6a45943bf878532a1db1ad0753f671d289f55f45dac93
size 22890415
static/uploads/84daab3df51f418ebff312b2ed129bc1_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99183f4ca670013008f6a45943bf878532a1db1ad0753f671d289f55f45dac93
size 22890415
static/uploads/8ba4aec007f5404db2e9ac9570e59ca6_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99183f4ca670013008f6a45943bf878532a1db1ad0753f671d289f55f45dac93
size 22890415
static/uploads/b0b93f4bcdcb4662865bb4dc26c1b243_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99183f4ca670013008f6a45943bf878532a1db1ad0753f671d289f55f45dac93
size 22890415
static/uploads/e051610a8a634fd9a9de3c016d38ce73_Paris Saint-Germain vs Atlético de Madrid Highlights | FIFA Club World Cup 2025.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99183f4ca670013008f6a45943bf878532a1db1ad0753f671d289f55f45dac93
size 22890415
utils/__init__.py
ADDED
File without changes
utils/file_utils.py
ADDED
@@ -0,0 +1,10 @@
import os
from uuid import uuid4

def save_uploaded_file(uploaded_file, upload_dir):
    os.makedirs(upload_dir, exist_ok=True)
    filename = f"{uuid4().hex}_{uploaded_file.filename}"
    filepath = os.path.join(upload_dir, filename)
    with open(filepath, "wb") as f:
        f.write(uploaded_file.file.read())
    return filepath