Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import torch | |
| import json | |
| import os | |
| from pathlib import Path | |
class VideoRetrieval:
    """Text-to-clip retrieval over pre-computed per-clip feature matrices.

    A text query is embedded with the ``all-MiniLM-L6-v2`` SentenceTransformer
    model and compared, by cosine similarity, against three feature matrices
    (visual, scene, object). The per-matrix similarities are blended with the
    fixed weights in ``FEATURE_WEIGHTS`` and the best-scoring clips are
    returned together with their metadata rows from ``clips_df``.
    """

    # Relative contribution of each feature matrix to the combined score.
    FEATURE_WEIGHTS = {
        'visual_features': 0.4,
        'scene_features': 0.3,
        'object_features': 0.3,
    }

    def __init__(self, use_dummy_data=True):
        """Load the text encoder and populate ``features`` / ``clips_df``.

        Args:
            use_dummy_data: When true, generate random demo data; otherwise
                try to load pre-computed data from disk via ``load_data``.
        """
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        if use_dummy_data:
            self.create_dummy_data()
        else:
            self.load_data()

    def create_dummy_data(self, n_clips=20, feature_dim=384):
        """Create dummy features and metadata for demonstration.

        Args:
            n_clips: Number of fake clips to generate.
            feature_dim: Width of each random feature vector. The default of
                384 matches the embedding size of all-MiniLM-L6-v2, which is
                required for the cosine similarity against query embeddings.

        Sets ``self.features`` (dict of ``(n_clips, feature_dim)`` arrays drawn
        from NumPy's global random generator) and ``self.clips_df``.
        """
        # Drawn in a fixed order so a caller-seeded global RNG stays reproducible.
        self.features = {
            'visual_features': np.random.randn(n_clips, feature_dim),
            'scene_features': np.random.randn(n_clips, feature_dim),
            'object_features': np.random.randn(n_clips, feature_dim),
        }

        movie_titles = [
            "The Matrix", "Inception", "The Dark Knight", "Pulp Fiction",
            "The Shawshank Redemption", "Forrest Gump", "The Godfather",
            "Fight Club", "Interstellar", "The Silence of the Lambs",
        ]
        descriptions = [
            "A dramatic confrontation in a dark room where the truth is revealed",
            "A high-stakes chase through a crowded city street",
            "An emotional reunion between long-lost friends",
            "A tense negotiation that determines the fate of many",
            "A quiet moment of reflection before a life-changing decision",
        ]
        # Sample YouTube clips (famous movie scenes)
        youtube_clips = [
            "https://www.youtube.com/watch?v=kcsNbQRU5TI",  # Matrix - Red Pill Blue Pill
            "https://www.youtube.com/watch?v=YoHD9XEInc0",  # Inception - Hallway Fight
            "https://www.youtube.com/watch?v=ZWCAf-xLV2k",  # Dark Knight - Interrogation
            "https://www.youtube.com/watch?v=Jomr9SAjcyw",  # Pulp Fiction - Restaurant
            "https://www.youtube.com/watch?v=SQ7_5MMbPYs",  # Shawshank - Hope Speech
        ]

        # The sample lists are shorter than n_clips, so cycle through them.
        data = [
            {
                'clip_id': f'clip_{i}',
                'movie_title': movie_titles[i % len(movie_titles)],
                'description': descriptions[i % len(descriptions)],
                'timestamp': f'{(i*5):02d}:00 - {(i*5+3):02d}:00',
                'duration': '3:00',
                'youtube_url': youtube_clips[i % len(youtube_clips)],
            }
            for i in range(n_clips)
        ]
        self.clips_df = pd.DataFrame(data)

    def load_data(self):
        """Load actual pre-computed features and metadata.

        Reads three ``.npy`` feature matrices and ``clips_metadata.csv`` using
        hard-coded relative paths (the ``path_to_*`` names are placeholders).
        If any file is missing, the error is reported through ``st.error`` and
        dummy data is generated instead.
        """
        try:
            features = {
                'visual_features': np.load('path_to_visual_features.npy'),
                'scene_features': np.load('path_to_scene_features.npy'),
                'object_features': np.load('path_to_object_features.npy'),
            }
            clips_df = pd.read_csv('clips_metadata.csv')
        except FileNotFoundError as e:
            st.error(f"Error loading data: {e}. Falling back to dummy data.")
            self.create_dummy_data()
        else:
            self.features = features
            self.clips_df = clips_df

    def encode_query(self, query_text):
        """Encode the text query into an embedding with the text model."""
        return self.text_model.encode(query_text)

    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Compute cosine similarity between a query and one feature matrix.

        Args:
            query_embedding: 1-D embedding of the query.
            feature_type: Key into ``self.features`` selecting the matrix.

        Returns:
            1-D array with one similarity value per row of the chosen matrix.
        """
        # cosine_similarity expects 2-D inputs; the query becomes a single row.
        query_row = np.asarray(query_embedding).reshape(1, -1)
        return cosine_similarity(query_row, self.features[feature_type])[0]

    def retrieve_clips(self, query_text, top_k=3):
        """Retrieve the top-k most relevant clips for a text query.

        Args:
            query_text: Natural-language description of the wanted scene.
            top_k: Maximum number of clips to return. Values larger than the
                number of clips return all clips; values <= 0 return ``[]``.

        Returns:
            List of dicts (best match first) with keys ``clip_id``,
            ``movie_title``, ``description``, ``timestamp``, ``youtube_url``
            and ``similarity_score`` (a plain ``float``).
        """
        # Guard first: slicing with [-0:] would select *every* clip, and a
        # negative top_k would drop the worst matches instead of limiting.
        if top_k <= 0:
            return []

        query_embedding = self.encode_query(query_text)

        # Weighted blend of the per-feature-type similarities.
        combined_similarities = sum(
            self.compute_similarity(query_embedding, feat_type) * weight
            for feat_type, weight in self.FEATURE_WEIGHTS.items()
        )

        # argsort is ascending, so take the tail and reverse it.
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            row = self.clips_df.iloc[idx]  # fetch the metadata row once
            results.append({
                'clip_id': row['clip_id'],
                'movie_title': row['movie_title'],
                'description': row['description'],
                'timestamp': row['timestamp'],
                'youtube_url': row['youtube_url'],
                # Convert to float for JSON serialization
                'similarity_score': float(combined_similarities[idx]),
            })
        return results
# NOTE(review): the emoji in these UI strings arrived mis-decoded (UTF-8 bytes
# read as a single-byte Greek code page). They were restored from the surviving
# bytes; the search, link, action and scene icons lost their distinguishing
# bytes and are best guesses -- confirm against the deployed app.


def _has_url(value):
    """Return True when *value* is a non-blank string (rejects None/NaN/'')."""
    return isinstance(value, str) and bool(value.strip())


def _render_result(result):
    """Render one retrieved clip: title, description, player and score."""
    with st.container():
        st.subheader(str(result['movie_title']))
        url = result['youtube_url']
        cols = st.columns([2, 1])
        with cols[0]:
            st.markdown("**Scene Description:**")
            st.write(result['description'])
            st.text(f"⏱️ Timestamp: {result['timestamp']}")
            # Embed the player only when there is something to play.
            if _has_url(url):
                st.video(url)
        with cols[1]:
            st.markdown("**Relevance Score:**")
            # st.progress needs a value in [0, 1]; cosine scores can be negative.
            score = min(1.0, max(0.0, result['similarity_score']))
            st.progress(score)
            st.text(f"{score:.2%} match")
            # Direct link, skipped for clips without a URL so that no dead
            # "[...]()" link is rendered.
            if _has_url(url):
                st.markdown(f"[🔗 Watch on YouTube]({url})")
                st.text("Click to open in a new tab")
        st.divider()


def _render_sidebar():
    """Render the static "About" and "Feature Weights" sidebar sections."""
    with st.sidebar:
        st.header("ℹ️ About")
        st.write(
            "This demo system simulates a video retrieval engine that uses:\n"
            "- 🎥 Visual scene understanding\n"
            "- 👥 Character interaction analysis\n"
            "- 🎯 Object detection\n"
            "- 🏃 Action recognition\n"
            "\n"
            "In a production system, these features would be pre-computed\n"
            "from actual movie clips using state-of-the-art AI models.\n"
        )
        st.header("⚙️ Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- 🎬 Visual Features: 40%")
        st.write("- 🎞️ Scene Features: 30%")
        st.write("- 📦 Object Features: 30%")


def main():
    """Build the Streamlit page: search form, result list and sidebar."""
    st.set_page_config(
        page_title="Movie Scene Retrieval System",
        page_icon="🎬",
        layout="wide"
    )
    st.title("🎬 Movie Scene Retrieval System")
    st.write(
        "Search for movie scenes using natural language descriptions.\n"
        "The system will retrieve the most relevant 2-3 minute clips "
        "based on your query.\n"
        "\n"
        "*Note: This is a demo version using simulated data.*\n"
    )

    # Build the retrieval system once per browser session; Streamlit re-runs
    # this script on every interaction, and loading the model is expensive.
    if "retrieval_system" not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval(use_dummy_data=True)
    retrieval_system = st.session_state.retrieval_system

    # Search interface
    col1, col2 = st.columns([3, 1])
    with col1:
        query = st.text_input(
            "Enter your scene description:",
            placeholder="e.g., A dramatic confrontation between two characters in a dark room"
        )
    with col2:
        num_results = st.slider("Number of results:", min_value=1, max_value=5, value=3)

    if st.button("🔍 Search", type="primary"):
        if not query:
            st.warning("Please enter a scene description.")
        else:
            with st.spinner("Searching for relevant clips..."):
                results = retrieval_system.retrieve_clips(query, top_k=num_results)
            for result in results:
                _render_result(result)

    # Sidebar with additional information
    _render_sidebar()


# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()