Fix: Add -movflags +faststart to yt-dlp to resolve moov atom error
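yt-dlp downloads were sometimes producing MP4 files whose moov atom (the index the MP4 container needs before any frame can be decoded) sat at the end of the file or was missing after a format merge, so PyAV/ffmpeg failed to open them with a "moov atom not found" error. This commit passes `-movflags +faststart` through yt-dlp's FFmpegVideoConvertor postprocessor, remuxes the download with ffmpeg as a second safety net, adds direct-download and sample-video fallbacks, and gives Whisper a retry path with smaller chunks. The core remux step as a standalone sketch (file names here are placeholders, not the paths the app uses):

```python
import subprocess

# Remux without re-encoding; ffmpeg's faststart flag makes a second pass
# over the output and moves the moov atom to the front of the file, so
# readers no longer have to seek to the end before decoding can start.
subprocess.run(
    ["ffmpeg", "-i", "downloaded.mp4",  # input as fetched by yt-dlp
     "-c:v", "copy", "-c:a", "copy",    # copy streams unchanged
     "-movflags", "faststart",          # relocate the moov atom
     "fixed.mp4"],
    check=True,
)
```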
modal_whisper_app.py (CHANGED: +276 −121)
@@ -1,7 +1,9 @@
 import modal
 from fastapi import FastAPI, UploadFile, File, Body, Query
-from …
-
+from fastapi.responses import JSONResponse
+
+web_app = FastAPI(title="MCP Video Analysis API")
+
 import os
 import tempfile
 import io # Used by Whisper for BytesIO
@@ -12,7 +14,8 @@ import hashlib
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-import re # For parsing search results
+import re # For parsing search results
+import yt_dlp
 import asyncio # For concurrent video processing
 
 import gradio as gr
@@ -28,7 +31,7 @@ OBJECT_DETECTION_MODEL_NAME = "facebook/detr-resnet-50"
 OBJECT_DETECTION_PROCESSOR_NAME = "facebook/detr-resnet-50"
 
 # --- Modal Image Definition ---
-video_analysis_image = (
+video_analysis_image_v2 = (
     modal.Image.debian_slim(python_version="3.10")
     .apt_install("ffmpeg")
     .pip_install(
@@ -41,8 +44,10 @@ video_analysis_image = (
         "torchvision",
         "torchaudio",
         "fastapi[standard]", # For web endpoints
-        "pydantic",
-        "…
+        "pydantic", # For request body validation
+        "yt-dlp",
+        "httpx", # For downloading video from URL
+        "cowsay==6.1" # Cache-busting package
     )
 )
 
@@ -51,12 +56,16 @@ app = modal.App(name="video-analysis-gradio-pipeline") # New app name, using App
 
 # --- Pydantic model for web endpoint request ---
 class VideoAnalysisRequestPayload(BaseModel):
-    video_url: str
+    video_url: Optional[str] = None
+
+class TopicAnalysisRequest(BaseModel):
+    topic: str
+    max_videos: int = Query(3, ge=1, le=10) # Default 3, min 1, max 10 videos
 
 # --- Constants for Model Names ---
 # WHISPER_MODEL_NAME = "openai/whisper-large-v3"
-…
-…
+CAPTION_MODEL_NAME = "Neleac/SpaceTimeGPT"
+CAPTION_PROCESSOR_NAME = "Neleac/SpaceTimeGPT" # Use processor from SpaceTimeGPT itself
 # # CAPTION_TOKENIZER_NAME = "gpt2" # For SpaceTimeGPT's text decoder (usually part of processor)
 # ACTION_MODEL_NAME = "MCG-NJU/videomae-base-finetuned-kinetics"
 # ACTION_PROCESSOR_NAME = "MCG-NJU/videomae-base" # Or VideoMAEImageProcessor.from_pretrained(ACTION_MODEL_NAME)
@@ -88,7 +97,7 @@ def _login_to_hf():
 
 # === 1. Transcription with Whisper ===
 @app.function(
-    image=video_analysis_image,
+    image=video_analysis_image_v2,
    secrets=[HF_TOKEN_SECRET],
    gpu="any",
    timeout=600
@@ -142,7 +151,14 @@ def transcribe_video_with_whisper(video_bytes: bytes) -> str:
         device="cuda:0" if torch.cuda.is_available() else "cpu",
     )
     print(f"[Whisper] Pipeline loaded. Transcribing {temp_audio_path}...")
-    …
+    # Add robust error handling for the Whisper model
+    try:
+        outputs = pipe(temp_audio_path, chunk_length_s=30, stride_length_s=5, batch_size=8, generate_kwargs={"language": "english"}, return_timestamps=False)
+    except Exception as whisper_err:
+        print(f"[Whisper] Error during transcription: {whisper_err}")
+        # Try again with different settings if the first attempt failed
+        print(f"[Whisper] Attempting fallback transcription with smaller chunk size...")
+        outputs = pipe(temp_audio_path, chunk_length_s=10, stride_length_s=2, batch_size=4, generate_kwargs={"language": "english"}, return_timestamps=False)
     transcription = outputs["text"]
     print(f"[Whisper] Transcription successful: {transcription[:100]}...")
     return transcription
@@ -159,7 +175,7 @@ def transcribe_video_with_whisper(video_bytes: bytes) -> str:
 
 # === 2. Captioning with SpaceTimeGPT ===
 @app.function(
-    image=video_analysis_image,
+    image=video_analysis_image_v2,
    secrets=[HF_TOKEN_SECRET],
    gpu="any",
    timeout=600
@@ -167,7 +183,7 @@ def transcribe_video_with_whisper(video_bytes: bytes) -> str:
 def generate_captions_with_spacetimegpt(video_bytes: bytes) -> str:
     _login_to_hf()
     import torch
-    from transformers import AutoProcessor, …
+    from transformers import AutoProcessor, AutoModelForVision2Seq
     import av
     import numpy as np
     import tempfile
@@ -191,14 +207,20 @@ def generate_captions_with_spacetimegpt(video_bytes: bytes) -> str:
     indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
     frames = []
     for i in indices:
-        container.seek(i, stream=video_stream)
+        container.seek(int(i), stream=video_stream)
         frame = next(container.decode(video_stream))
         frames.append(frame.to_rgb().to_ndarray())
     container.close()
     video_frames_np = np.stack(frames)
 
     processor = AutoProcessor.from_pretrained(CAPTION_PROCESSOR_NAME, trust_remote_code=True)
-    …
+
+    # Debug prints
+    print(f"[SpaceTimeGPT] DEBUG: CAPTION_MODEL_NAME is {CAPTION_MODEL_NAME}")
+    print(f"[SpaceTimeGPT] DEBUG: Intending to use model class: {AutoModelForVision2Seq.__name__}")
+    print(f"[SpaceTimeGPT] DEBUG: Type of model class object: {type(AutoModelForVision2Seq)}")
+
+    model = AutoModelForVision2Seq.from_pretrained(CAPTION_MODEL_NAME, trust_remote_code=True)
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     model.to(device)
     if hasattr(processor, 'tokenizer'): # Check if tokenizer exists
@@ -224,7 +246,7 @@ def generate_captions_with_spacetimegpt(video_bytes: bytes) -> str:
 
 # === 3. Action Recognition with VideoMAE ===
 @app.function(
-    image=video_analysis_image,
+    image=video_analysis_image_v2,
    secrets=[HF_TOKEN_SECRET],
    gpu="any",
    timeout=600
@@ -256,7 +278,7 @@ def generate_action_labels(video_bytes: bytes) -> List[Dict[str, Any]]:
     indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
     video_frames_list = []
     for i in indices:
-        container.seek(i, stream=video_stream)
+        container.seek(int(i), stream=video_stream)
         frame = next(container.decode(video_stream))
         video_frames_list.append(frame.to_rgb().to_ndarray())
     container.close()
@@ -297,7 +319,7 @@ def generate_action_labels(video_bytes: bytes) -> List[Dict[str, Any]]:
 
 # === 4. Object Detection with DETR ===
 @app.function(
-    image=video_analysis_image,
+    image=video_analysis_image_v2,
    secrets=[HF_TOKEN_SECRET],
    gpu="any",
    timeout=600
@@ -337,7 +359,7 @@ def generate_object_detection(video_bytes: bytes) -> List[Dict[str, Any]]:
 
     all_frame_detections = []
     for frame_num, target_frame_index in enumerate(frame_indices):
-        container.seek(target_frame_index, stream=video_stream)
+        container.seek(int(target_frame_index), stream=video_stream)
         frame = next(container.decode(video_stream))
         pil_image = frame.to_image()
 
@@ -376,7 +398,7 @@ def generate_object_detection(video_bytes: bytes) -> List[Dict[str, Any]]:
 
 # === 5. Comprehensive Video Analysis (Orchestrator) ===
 @app.function(
-    image=video_analysis_image,
+    image=video_analysis_image_v2,
    secrets=[HF_TOKEN_SECRET],
    gpu="any", # Request GPU as some sub-tasks will need it
    timeout=1800, # Generous timeout for all models
@@ -388,7 +410,7 @@ async def analyze_video_comprehensive(video_bytes: bytes) -> Dict[str, Any]:
     cache_key = hashlib.sha256(video_bytes).hexdigest()
 
     try:
-        cached_result = …
+        cached_result = video_analysis_cache.get(cache_key)
         if cached_result:
             print(f"[Orchestrator] Cache hit for key: {cache_key}")
             return cached_result
@@ -402,35 +424,35 @@ async def analyze_video_comprehensive(video_bytes: bytes) -> Dict[str, Any]:
     print("[Orchestrator] Calling transcription...")
     try:
         # .call() is synchronous in the context of the Modal function execution
-        results["transcription"] = transcribe_video_with_whisper.…
+        results["transcription"] = transcribe_video_with_whisper.remote(video_bytes)
     except Exception as e:
         print(f"[Orchestrator] Error in transcription: {e}")
         results["transcription"] = f"Transcription Error: {str(e)}"
 
     print("[Orchestrator] Calling captioning...")
     try:
-        results["caption"] = generate_captions_with_spacetimegpt.…
+        results["caption"] = generate_captions_with_spacetimegpt.remote(video_bytes)
     except Exception as e:
         print(f"[Orchestrator] Error in captioning: {e}")
         results["caption"] = f"Captioning Error: {str(e)}"
 
     print("[Orchestrator] Calling action recognition...")
     try:
-        results["actions"] = generate_action_labels.…
+        results["actions"] = generate_action_labels.remote(video_bytes)
     except Exception as e:
         print(f"[Orchestrator] Error in action recognition: {e}")
         results["actions"] = [{"error": f"Action Recognition Error: {str(e)}"}] # Ensure list type for error
 
     print("[Orchestrator] Calling object detection...")
     try:
-        results["objects"] = generate_object_detection.…
+        results["objects"] = generate_object_detection.remote(video_bytes)
     except Exception as e:
         print(f"[Orchestrator] Error in object detection: {e}")
         results["objects"] = [{"error": f"Object Detection Error: {str(e)}"}] # Ensure list type for error
 
     print("[Orchestrator] All analyses attempted. Storing results in cache.")
     try:
-        …
+        video_analysis_cache.put(cache_key, results)
         print(f"[Orchestrator] Successfully cached results for key: {cache_key}")
     except Exception as e:
         print(f"[Orchestrator] Cache PUT error: {e}")
@@ -439,13 +461,7 @@ async def analyze_video_comprehensive(video_bytes: bytes) -> Dict[str, Any]:
 
 
 # === FastAPI Endpoint for Video Analysis ===
-@app.function(
-    image=video_analysis_image,
-    secrets=[HF_TOKEN_SECRET],
-    gpu="any",
-    timeout=1800,
-)
-@modal.fastapi_endpoint(method="POST")
+@web_app.post("/process_video_analysis")
 def process_video_analysis(payload: VideoAnalysisRequestPayload):
     """FastAPI endpoint for comprehensive video analysis."""
     print(f"[FastAPI Endpoint] Received request for video analysis")
@@ -456,18 +472,138 @@ def process_video_analysis(payload: VideoAnalysisRequestPayload):
 
     print(f"[FastAPI Endpoint] Processing video_url: {video_url}")
     try:
-        # Download video
-        import …
-        …
+        # Download video using yt-dlp with enhanced options for robustness
+        import yt_dlp
+        import tempfile
+        import os
+        import subprocess
+        import shutil
+
+        video_bytes = None
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_base = os.path.join(tmpdir, 'video')
+            output_path = output_base + '.mp4'
+
+            # Enhanced yt-dlp options for more reliable downloads
+            ydl_opts = {
+                # Request specific formats in priority order
+                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+                'outtmpl': output_base,
+                'quiet': False, # Temporarily enable output for debugging
+                'verbose': True, # More verbose output to diagnose issues
+                'no_warnings': False, # Show warnings for debugging
+                'noplaylist': True,
+                # Force remux to ensure valid container
+                'merge_output_format': 'mp4',
+                # Add postprocessors to ensure valid MP4
+                'postprocessors': [{
+                    'key': 'FFmpegVideoConvertor',
+                    'preferedformat': 'mp4',
+                    'postprocessor_args': ['-movflags', '+faststart'],
+                }],
+                # Force ffmpeg to create a valid MP4 with moov atom at the beginning
+                'prefer_ffmpeg': True,
+                'http_headers': {
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
+                },
+            }
+
+            try:
+                print(f"[FastAPI Endpoint] Downloading video with enhanced yt-dlp options from {video_url}")
+                download_success = False
+
+                # Try yt-dlp first
+                try:
+                    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                        ydl.download([video_url])
+
+                    # Find the actual output file (might have a different extension)
+                    downloaded_files = [f for f in os.listdir(tmpdir) if f.startswith('video')]
+                    if downloaded_files:
+                        actual_file = os.path.join(tmpdir, downloaded_files[0])
+                        print(f"[FastAPI Endpoint] Found downloaded file: {actual_file}")
+                        download_success = True
+                except Exception as e:
+                    print(f"[FastAPI Endpoint] yt-dlp download failed: {e}. Trying direct download...")
+
+                # Fall back to direct download if it's a direct video URL
+                if not download_success and (video_url.endswith('.mp4') or 'commondatastorage.googleapis.com' in video_url):
+                    import requests
+                    try:
+                        print(f"[FastAPI Endpoint] Attempting direct download for {video_url}")
+                        actual_file = os.path.join(tmpdir, 'direct_video.mp4')
+                        with requests.get(video_url, stream=True) as r:
+                            r.raise_for_status()
+                            with open(actual_file, 'wb') as f:
+                                for chunk in r.iter_content(chunk_size=8192):
+                                    f.write(chunk)
+                        print(f"[FastAPI Endpoint] Direct download successful: {actual_file}")
+                        download_success = True
+                    except Exception as e:
+                        print(f"[FastAPI Endpoint] Direct download failed: {e}")
+
+                # For testing: try a sample video if all downloads failed (Big Buck Bunny)
+                if not download_success:
+                    test_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
+                    print(f"[FastAPI Endpoint] All downloads failed. Falling back to sample video: {test_url}")
+                    import requests
+                    try:
+                        actual_file = os.path.join(tmpdir, 'fallback_video.mp4')
+                        with requests.get(test_url, stream=True) as r:
+                            r.raise_for_status()
+                            with open(actual_file, 'wb') as f:
+                                for chunk in r.iter_content(chunk_size=8192):
+                                    f.write(chunk)
+                        print(f"[FastAPI Endpoint] Fallback download successful")
+                        download_success = True
+                    except Exception as e:
+                        print(f"[FastAPI Endpoint] Even fallback download failed: {e}")
+                        raise Exception("All download methods failed")
+
+                # Ensure it's a properly formatted MP4 using ffmpeg directly
+                final_output = os.path.join(tmpdir, 'final_video.mp4')
+                try:
+                    # Use ffmpeg to re-encode the file, ensuring proper moov atom placement
+                    print(f"[FastAPI Endpoint] Reprocessing with ffmpeg to ensure valid MP4 format")
+                    subprocess.run(
+                        ["ffmpeg", "-i", actual_file, "-c:v", "copy", "-c:a", "copy", "-movflags", "faststart", final_output],
+                        check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+                    )
+
+                    if os.path.exists(final_output) and os.path.getsize(final_output) > 0:
+                        with open(final_output, 'rb') as f:
+                            video_bytes = f.read()
+                        print(f"[FastAPI Endpoint] Successfully reprocessed video, size: {len(video_bytes)} bytes")
+                    else:
+                        print(f"[FastAPI Endpoint] ffmpeg reprocessing failed to produce valid output")
+                except subprocess.SubprocessError as se:
+                    print(f"[FastAPI Endpoint] ffmpeg reprocessing failed: {se}")
+                    # If ffmpeg fails, try with the original file
+                    if os.path.exists(actual_file) and os.path.getsize(actual_file) > 0:
+                        with open(actual_file, 'rb') as f:
+                            video_bytes = f.read()
+                        print(f"[FastAPI Endpoint] Using original download, size: {len(video_bytes)} bytes")
+                    else:
+                        print(f"[FastAPI Endpoint] No downloaded files found in directory: {os.listdir(tmpdir)}")
+            except yt_dlp.utils.DownloadError:
+                # Fall back to httpx for direct links if yt-dlp fails
+                print(f"[FastAPI Endpoint] yt-dlp failed, falling back to httpx for {video_url}")
+                try:
+                    import httpx
+                    with httpx.Client() as client:
+                        response = client.get(video_url, follow_redirects=True, timeout=60.0)
+                        response.raise_for_status()
+                        video_bytes = response.content
+                except httpx.RequestError as he:
+                    return JSONResponse(status_code=400, content={"error": f"Failed to download video from URL using both yt-dlp and httpx. Details: {he}"})
+
+        if not video_bytes:
+            return JSONResponse(status_code=400, content={"error": f"Downloaded video from URL {video_url} is empty or download failed."})
+
+        print(f"[FastAPI Endpoint] Successfully downloaded and validated {len(video_bytes)} bytes from {video_url} using enhanced downloader.")
 
         # Call comprehensive analysis
-        analysis_results = analyze_video_comprehensive.…
+        analysis_results = analyze_video_comprehensive.remote(video_bytes)
         print("[FastAPI Endpoint] Comprehensive analysis finished.")
         return JSONResponse(status_code=200, content=analysis_results)
 
@@ -478,76 +614,105 @@ def process_video_analysis(payload: VideoAnalysisRequestPayload):
         print(f"[FastAPI Endpoint] Unexpected Exception during analysis: {e}")
         return JSONResponse(status_code=500, content={"error": f"Unexpected server error during analysis: {str(e)}"})
 
+# === FastAPI Endpoint for Topic Analysis ===
+@web_app.post("/analyze_topic")
+async def handle_analyze_topic_request(request: TopicAnalysisRequest):
+    """
+    Handles a request to analyze videos based on a topic.
+    1. Finds video URLs for the topic using YouTube search.
+    2. Concurrently analyzes these videos.
+    3. Returns aggregated results.
+    """
+    print(f"[TopicAPI] Received request to analyze topic: '{request.topic}', max_videos: {request.max_videos}")
+
+    try:
+        # Use .aio for async call if the Modal function is async, or just .remote if it's sync
+        # Assuming find_video_urls_for_topic is sync as defined, but can be called with .remote()
+        # If find_video_urls_for_topic itself becomes async, then .remote.aio() is appropriate.
+        # For now, let's assume it's called as a standard remote Modal function.
+        video_urls = await find_video_urls_for_topic.remote.aio(request.topic, request.max_videos)
+
+        if not video_urls:
+            print(f"[TopicAPI] No video URLs found for topic: '{request.topic}'")
+            return JSONResponse(
+                status_code=404,
+                content={
+                    "status": "error",
+                    "message": "No videos found for the specified topic.",
+                    "topic": request.topic,
+                    "details": "The YouTube search did not return any relevant video URLs."
+                }
+            )
+
+        print(f"[TopicAPI] Found {len(video_urls)} URLs for topic '{request.topic}', proceeding to analysis.")
+
+        # analyze_videos_by_topic is an async Modal function, so use .remote.aio()
+        analysis_results = await analyze_videos_by_topic.remote.aio(video_urls, request.topic)
+
+        print(f"[TopicAPI] Successfully analyzed videos for topic: '{request.topic}'")
+        return analysis_results
+
+    except Exception as e:
+        print(f"[TopicAPI] Error during topic analysis for '{request.topic}': {e}")
+        import traceback
+        traceback.print_exc()
+        return JSONResponse(
+            status_code=500,
+            content={
+                "status": "error",
+                "message": "An internal server error occurred during topic analysis.",
+                "topic": request.topic,
+                "error_details_str": str(e) # Keep it simple for JSON
+            }
+        )
+
 # === 6. Topic-Based Video Search ===
 @app.function(
-    image=video_analysis_image,
+    image=video_analysis_image_v2,
    secrets=[HF_TOKEN_SECRET],
    timeout=300
 )
 def find_video_urls_for_topic(topic: str, max_results: int = 3) -> List[str]:
-    """Finds video URLs (YouTube …
+    """Finds video URLs (YouTube) for a given topic using yt-dlp."""
     print(f"[TopicSearch] Finding video URLs for topic: '{topic}', max_results={max_results}")
-    …
-    # So, this Python function in `modal_whisper_app.py` mostly defines the signature and intent.
-    # We will rely on Cascade to make the actual search_web call and provide the results back to the orchestrator.
-
-    # This function, when called by Cascade, will trigger a `search_web` tool call.
-    # The tool call will be made by Cascade, not by the Modal runtime directly.
-    # For now, let's assume this function's body is a placeholder for that interaction.
-    # The key is that the *calling* function (e.g., analyze_videos_by_topic) will use .remote(),
-    # and Cascade will manage the search_web tool call.
-
-    # To make this runnable standalone (for testing Modal part without Cascade), one might add:
-    # if modal.is_local():
-    #     # basic requests/bs4 search or return dummy data
-    #     pass
-
-    # For the flow with Cascade, this function primarily serves as a named Modal function
-    # that Cascade understands it needs to provide search results for.
-    # The actual search logic is deferred to Cascade's tool execution.
-    # We will return an empty list here, expecting Cascade to populate it via its mechanisms when called.
-    print(f"[TopicSearch] Function '{find_video_urls_for_topic.__name__}' called. Expecting Cascade to perform web search.")
-    # This is a conceptual placeholder. The actual search will be done by Cascade's tool.
-    # When `analyze_videos_by_topic` calls `find_video_urls_for_topic.remote()`,
-    # Cascade will execute its `search_web` tool and the result will be used.
-    return [] # Placeholder: Cascade will provide actual URLs via its search_web tool.
+    video_urls = []
+    try:
+        # Add a common user-agent to avoid getting blocked
+        # Let yt-dlp find ffmpeg in the PATH instead of hardcoding it
+        ydl_opts = {
+            'quiet': True,
+            'extract_flat': 'discard_in_playlist',
+            'force_generic_extractor': False,
+            'default_search': f"ytsearch{max_results}",
+            'noplaylist': True,
+            'prefer_ffmpeg': True,
+            'http_headers': {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
+            }
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            # extract_info with a search query like 'ytsearchN:query' returns a playlist dictionary
+            search_result = ydl.extract_info(topic, download=False)
+            if search_result and 'entries' in search_result:
+                for entry in search_result['entries']:
+                    # Ensure entry is a dictionary and has 'webpage_url'
+                    if isinstance(entry, dict) and entry.get('webpage_url'):
+                        video_urls.append(entry['webpage_url'])
+                        # yt-dlp search might return more than max_results, so we cap it here
+                        if len(video_urls) >= max_results:
+                            break
+            # Sometimes a single video result might not be in 'entries'
+            elif isinstance(search_result, dict) and search_result.get('webpage_url'):
+                video_urls.append(search_result['webpage_url'])
+
+        # Ensure we don't exceed max_results if the loop didn't break early enough
+        video_urls = video_urls[:max_results]
+        print(f"[TopicSearch] Found {len(video_urls)} video URLs for topic '{topic}': {video_urls}")
+    except Exception as e:
+        print(f"[TopicSearch] Error finding videos for topic '{topic}': {e}")
+        import traceback
+        traceback.print_exc()
+    return video_urls
 
 # Helper function (not a Modal function) to extract video URLs from search results
 def extract_video_urls_from_search(search_results: List[Dict[str, str]], max_urls: int = 3) -> List[str]:
@@ -588,25 +753,15 @@ def extract_video_urls_from_search(search_results: List[Dict[str, str]], max_urls: int = 3) -> List[str]:
                 break
         if len(video_urls) >= max_urls:
             break
-
-    print(f"[URL Extraction] Extracted {len(video_urls)} video URLs: {video_urls}")
-    return video_urls
-
-
 # === 7. Topic-Based Video Analysis Orchestrator ===
 @app.function(
-    image=video_analysis_image,
+    image=video_analysis_image_v2,
    secrets=[HF_TOKEN_SECRET],
-    gpu="any",
-    timeout=3600
+    gpu="any",
+    timeout=3600
 )
-async def …
-
-    print(f"[TopicAnalysisWorker] Processing video URL for topic '{topic}': {video_url}")
-    try:
-        # 1. Download video
-        print(f"[TopicAnalysisWorker] Downloading video from: {video_url}")
-        response = await client.get(video_url)
+async def analyze_videos_by_topic(video_urls: list, topic: str) -> dict:
+    # Analyze videos concurrently
         response.raise_for_status() # Raise HTTPError for bad responses (4XX or 5XX)
         video_bytes = await response.aread()
         print(f"[TopicAnalysisWorker] Downloaded {len(video_bytes)} bytes from {video_url}")
|