Salimshakeel committed
Commit 1579b70 · 1 Parent(s): 7cd255e

routes/summarize.py CHANGED
@@ -1,6 +1,6 @@
 from fastapi import APIRouter, UploadFile, File
 from fastapi.responses import JSONResponse
-from services.extractor import extract_frames, extract_features
+from services.extractor import extract_features
 from services.summarizer import get_scores, get_selected_indices, save_summary_video
 from uuid import uuid4
 import time
@@ -25,11 +25,8 @@ def summarize_video(video: UploadFile = File(...)):
     with open(filepath, "wb") as f:
         f.write(video.file.read())
 
-    print("\n-----------> Extracting Frames ....")
-    frames, picks = extract_frames(filepath)
-
     print("\n-----------> Extracting Features ....")
-    features = extract_features(frames)
+    features, picks = extract_features(filepath)
 
     print("\n-----------> Getting Scores ....")
     scores = get_scores(features)
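
Frame extraction and feature extraction are now a single call: extract_features(filepath) returns both the feature tensor and the sampled frame indices (picks). A quick way to exercise the updated endpoint is sketched below; it assumes the router is registered on an app in main.py and exposed as POST /summarize, neither of which is shown in this diff.

import io

from fastapi.testclient import TestClient
from main import app  # hypothetical entry point, not part of this commit

client = TestClient(app)

# Upload a local file the way the endpoint's UploadFile parameter expects it.
with open("sample.mp4", "rb") as f:
    response = client.post(
        "/summarize",
        files={"video": ("sample.mp4", f, "video/mp4")},
    )
print(response.status_code, response.json())
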
services/extractor.py CHANGED
@@ -5,6 +5,7 @@ from PIL import Image
 from torchvision import models, transforms
 from config import DEVICE, FRAME_RATE
 from tqdm import tqdm
+from services.model_loader import batch_inference
 
 # Load GoogLeNet once
 from torchvision.models import GoogLeNet_Weights
@@ -31,6 +32,7 @@ feature_extractor = torch.nn.Sequential(
     googlenet.avgpool,
     torch.nn.Flatten()
 )
+feature_extractor = feature_extractor.eval()
 
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
@@ -41,33 +43,32 @@ transform = transforms.Compose([
     )
 ])
 
-def extract_frames(video_path):
+def extract_features(video_path):
     cap = cv2.VideoCapture(video_path)
     frames = []
     indices = []
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     # total_frames = 300  # TEMP
     print(f"Total frames in video: {total_frames}")
-    print(f"Extracting frames at every {FRAME_RATE} frames...")
 
-    for idx in tqdm(range(0, total_frames, FRAME_RATE)):
+    for idx in tqdm(range(total_frames)):
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if not ret:
             break
-        frames.append(Image.fromarray(frame))
-        indices.append(idx)
 
-    print(f"Indices of extracted frames: {indices}")
-    print(f"Total frames extracted: {len(frames)}")
+        # Convert OpenCV's BGR frame to RGB before the torchvision transform.
+        frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        frame = transform(frame)
+
+        frames.append(frame)
+        indices.append(idx)
 
     cap.release()
-    return frames, indices
 
-def extract_features(frames):
-    features = [transform(frame) for frame in frames]
-    features = torch.stack(features).to(DEVICE)
-    print("Features before GoogleNet extraction:", features.shape)
-    features = feature_extractor(features)
-    print("Features after GoogleNet extraction:", features.shape)
-    return features
+    frames = torch.stack(frames).to(DEVICE)
+    print("Features before GoogleNet extraction:", frames.shape)
+    frames = batch_inference(model=feature_extractor, input=frames, batch_size=32)
+    print("Features after GoogleNet extraction:", frames.shape)
+
+    return frames, indices
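
Two observations on this hunk: with the FRAME_RATE stride removed, the FRAME_RATE import in this module is now unused; and since the loop now reads every frame in order, the per-frame cap.set(cv2.CAP_PROP_POS_FRAMES, idx) forces a redundant seek before each read. A sequential-read variant (a sketch, not part of this commit) returns the same frames without seeking:

import cv2
import torch
from PIL import Image

def decode_frames_sequential(video_path, transform):
    """Decode every frame in display order; no per-frame seeking needed."""
    cap = cv2.VideoCapture(video_path)
    frames, indices = [], []
    idx = 0
    while True:
        ret, frame = cap.read()  # advances exactly one frame per call
        if not ret:
            break
        frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frames.append(transform(frame))
        indices.append(idx)
        idx += 1
    cap.release()
    return torch.stack(frames), indices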
 
services/model_loader.py CHANGED
@@ -4,6 +4,7 @@ import os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
 from layers.summarizer import PGL_SUM
 from config import DEVICE
+from tqdm import tqdm
 
 def load_model(weights_path):
     model = PGL_SUM(
@@ -17,3 +18,13 @@ def load_model(weights_path):
     model.load_state_dict(torch.load(weights_path, map_location=DEVICE))
     model.eval()
     return model
+
+def batch_inference(model, input, batch_size=128):
+    model.eval()
+    output = []
+    with torch.no_grad():
+        for i in tqdm(range(0, input.size(0), batch_size)):
+            batch = input[i:i + batch_size].to(DEVICE)
+            out = model(batch)
+            output.append(out.cpu())
+    return torch.cat(output, dim=0)
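
The new helper bounds GPU memory to one batch at a time: each slice is moved to DEVICE, run under torch.no_grad(), and the result is brought back to the CPU before the next slice. A minimal sanity check with a stand-in model (torch.nn.Identity here is only for illustration):

import torch
from services.model_loader import batch_inference

model = torch.nn.Identity()         # stand-in for the real feature extractor
x = torch.randn(1000, 3, 224, 224)  # e.g. 1000 preprocessed frames
y = batch_inference(model, x, batch_size=128)
assert y.shape[0] == x.shape[0]     # 8 chunks, concatenated back in order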
services/summarizer.py CHANGED
@@ -60,7 +60,7 @@ def save_summary_video(video_path, selected_indices, output_path, fps=15):
     out.release()
 
     print("Fixing the video with ffmpeg")
-    # fix_video_with_ffmpeg(output_path)
+    fix_video_with_ffmpeg(output_path)
 
 def fix_video_with_ffmpeg(path):
     temp_path = path + ".fixed.mp4"
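
This re-enables the previously commented-out ffmpeg fix-up. The body of fix_video_with_ffmpeg beyond the temp_path line is not shown in this diff; one plausible implementation, assuming its purpose is to re-encode OpenCV's raw output into a browser-playable H.264 file, is sketched below (standard ffmpeg flags, but the actual function may differ):

import os
import subprocess

def fix_video_with_ffmpeg(path):
    temp_path = path + ".fixed.mp4"
    # Re-encode to H.264/yuv420p for broad playback support; +faststart
    # moves the moov atom so playback can begin before the download finishes.
    subprocess.run(
        ["ffmpeg", "-y", "-i", path,
         "-c:v", "libx264", "-pix_fmt", "yuv420p",
         "-movflags", "+faststart", temp_path],
        check=True,
    )
    os.replace(temp_path, path)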