Spaces:

Penguni
/

dashboardmovie

Sleeping

App Files Files Community

Penguni committed on Jun 25, 2024

Commit

a57a561

verified ·

1 Parent(s): ce46156

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -106

app.py CHANGED Viewed

@@ -1,18 +1,21 @@
 import sqlite3
 import pandas as pd
 import plotly.express as px
-import plotly.graph_objects as go
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from collections import Counter
 import numpy as np
-import streamlit as st
 # Function to load data from SQLite database
 def load_data(db_file):
     """Open the SQLite database stored at ``db_file``.

     Args:
         db_file: Path of the SQLite database file to open.

     Returns:
         The ``sqlite3.Connection`` produced by ``sqlite3.connect``; the
         caller is responsible for closing it.
     """
     return sqlite3.connect(db_file)
 # Function to fetch summary info from database
 def fetch_summary_info(conn):
     # Fetch total count of movies
@@ -40,31 +43,6 @@ def fetch_summary_info(conn):
     return total_movies, total_years, avg_rating
-# Function to fetch data from database based on query
-def fetch_data(conn, query):
-    return pd.read_sql_query(query, conn)
-# Function to fetch genre movie releases by year
-def fetch_genre_movie_releases(conn):
-    query = '''
-        SELECT startYear, genres
-        FROM title_basics
-        WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
-    '''
-    df = pd.read_sql_query(query, conn)
-    # Split genres and explode to separate rows
-    df['genres'] = df['genres'].str.split(',')
-    df = df.explode('genres')
-    # Convert startYear to numeric
-    df['startYear'] = pd.to_numeric(df['startYear'])
-    # Group by startYear and genre, count the number of movies
-    genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
-    return genre_counts
 # Function to plot global map of total films per region
 def plot_global_map(df):
     # Country code to name mapping
@@ -106,128 +84,125 @@ def plot_global_map(df):
         'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
         'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
         'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
-        'VE': 'Venezuela', 'VN': 'Viet Nam', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
     }
-    # Mapping country codes to names
     df['region'] = df['region'].map(country_mapping)
-    # Count total films per country
-    country_counts = df['region'].value_counts().reset_index(name='total_films')
-    # Plotting with Plotly Express
-    fig = px.choropleth(country_counts, locations='index', locationmode='country names', color='total_films',
-                        hover_name='index', color_continuous_scale='Viridis',
                         title='Global Map of Total Films by Country')
     return fig
-# Function to plot word cloud of top genres
-def plot_word_cloud(genres_list):
-    genre_counts = Counter(genres_list)
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
-    # Plot the word cloud
     plt.figure(figsize=(10, 5))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
-    plt.title('Top Genres Word Cloud')
-    return plt
-# Function to find best movies by genre
 def find_best_movies_by_genre(conn):
     query = '''
-        SELECT tb.genres, tb.primaryTitle, tr.averageRating
         FROM title_basics tb
         JOIN title_ratings tr ON tb.tconst = tr.tconst
         WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
-        ORDER BY tr.averageRating DESC
     '''
     df = fetch_data(conn, query)
-    # Split genres and keep the top-rated movie for each genre
-    genre_movie_mapping = {}
-    for _, row in df.iterrows():
-        genres = row['genres'].split(',')
-        for genre in genres:
-            if genre not in genre_movie_mapping:
-                genre_movie_mapping[genre] = (row['primaryTitle'], row['averageRating'])
-    # Create a DataFrame for display
-    best_movies_by_genre = pd.DataFrame([
-        {'Genre': genre, 'Movie': movie[0], 'Rating': movie[1]}
-        for genre, movie in genre_movie_mapping.items()
-    ])
     return best_movies_by_genre
-# Streamlit app
 def main():
-    st.title("IMDb Movie Data Analysis")
     # Load data from SQLite database
-    db_file = '/content/imdb_data/imdb_data.db'  # Adjust path as needed
     conn = load_data(db_file)
-    # Fetch summary info
     total_movies, total_years, avg_rating = fetch_summary_info(conn)
-    # Layout for summary info
     col1, col2, col3 = st.columns(3)
     with col1:
         st.subheader("Total Movies")
-        st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_movies}</h1>", unsafe_allow_html=True)
     with col2:
         st.subheader("Total Years")
-        st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_years}</h1>", unsafe_allow_html=True)
     with col3:
         st.subheader("Average Rating")
-        st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{avg_rating:.2f}</h1>", unsafe_allow_html=True)
-    # Fetch best movies by genre
-    best_movies_by_genre = find_best_movies_by_genre(conn)
-    # Layout for table, global map, and word cloud
-    st.markdown("---")
-    row1_col1, row1_col2, row1_col3 = st.columns([1, 1, 1])
-    with row1_col1:
         st.subheader("Best Movie of Each Genre")
         st.write(best_movies_by_genre)
-    with row1_col2:
         st.subheader("Global Map of Total Films by Country")
-        query_country_distribution = '''
-            SELECT region
-            FROM title_akas
-        '''
-        country_distribution = fetch_data(conn, query_country_distribution)
-        fig = plot_global_map(country_distribution)
-        st.plotly_chart(fig)
-    with row1_col3:
         st.subheader("Word Cloud of Top Genres")
-        query_genres = '''
-            SELECT genres
-            FROM title_basics
-            WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
-        '''
-        genres_list = fetch_data(conn, query_genres)['genres'].str.split(',').explode().tolist()
-        plt = plot_word_cloud(genres_list)
-        st.pyplot(plt)
-    # Fetch genre movie releases by year
-    genre_counts = fetch_genre_movie_releases(conn)
-    # Plot line chart using Plotly Express
-    st.markdown("---")
-    st.subheader("Genre Movie Releases by Year")
-    fig_genre_counts = px.line(genre_counts, x='startYear', y='count', color='genres',
-                               title='Genre Movie Releases by Year',
-                               labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
-    fig_genre_counts.update_layout(xaxis_tickmode='linear')  # Ensure x-axis ticks are shown in a linear manner
-    fig_genre_counts.update_xaxes(range=[2000, 2025])
-    st.plotly_chart(fig_genre_counts)
     # Close database connection
     conn.close()
 if __name__ == '__main__':
-    main()

 import sqlite3
 import pandas as pd
 import plotly.express as px
+import streamlit as st
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from collections import Counter
 import numpy as np
 # Function to load data from SQLite database
 def load_data(db_file):
     """Open the SQLite database stored at ``db_file``.

     Args:
         db_file: Path of the SQLite database file to open.

     Returns:
         The ``sqlite3.Connection`` produced by ``sqlite3.connect``; the
         caller is responsible for closing it.
     """
     return sqlite3.connect(db_file)
+# Function to fetch data from database based on query
+def fetch_data(conn, query):
+    return pd.read_sql_query(query, conn)
 # Function to fetch summary info from database
 def fetch_summary_info(conn):
     # Fetch total count of movies
     return total_movies, total_years, avg_rating
 # Function to plot global map of total films per region
 def plot_global_map(df):
     # Country code to name mapping
         'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
         'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
         'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
+        'VE': 'Venezuela, Bolivarian Republic of', 'VN': 'Viet Nam', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
     }
+    # Map country codes to country names
     df['region'] = df['region'].map(country_mapping)
+    # Group by country and count the number of films
+    df_grouped = df.groupby('region').size().reset_index(name='total_films')
+    # Apply log transformation to handle outliers
+    df_grouped['log_total_films'] = np.log10(df_grouped['total_films'] + 1)
+    # Plotting the global map
+    fig = px.choropleth(df_grouped,
+                        locations='region',
+                        locationmode='country names',
+                        color='log_total_films',
+                        hover_name='region',
+                        color_continuous_scale=px.colors.sequential.Plasma,
+                        labels={'log_total_films': 'Log Total Films'},
                         title='Global Map of Total Films by Country')
     return fig
+# Function to create word cloud of genres
+def create_genre_wordcloud(conn):
+    query = '''
+        SELECT genres
+        FROM title_basics
+        WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
+    '''
+    df = fetch_data(conn, query)
+    # Process genres
+    genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
+    genre_counts = Counter(genres)
+    # Generate the word cloud
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
+    # Display the word cloud
     plt.figure(figsize=(10, 5))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
+    plt.title('Top Genres in IMDb Dataset')
+    st.pyplot(plt.gcf())  # Pass the current figure explicitly to st.pyplot()
+# Function to find best movie of each genre by numVotes * averageRating
 def find_best_movies_by_genre(conn):
     """Return the highest-scoring movie for each primary genre.

     A movie's primary genre is the first entry of its comma-separated
     ``genres`` field, and its score is ``numVotes * averageRating``.

     Args:
         conn: Open database connection passed through to ``fetch_data``.

     Returns:
         A ``pandas.DataFrame`` with the columns ``genre``, ``primaryTitle``,
         ``startYear``, ``averageRating``, ``numVotes`` and ``score``: one row
         per genre, sorted by ``score`` in descending order with a fresh
         index. The frame is empty when no movie has a usable score.
     """
     query = '''
         SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
         FROM title_basics tb
         JOIN title_ratings tr ON tb.tconst = tr.tconst
         WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
     '''
     df = fetch_data(conn, query)
     columns = ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']

     # Take the first genre of each movie. ``.str[0]`` is used instead of
     # ``split(expand=True)[0]`` because the expanded frame may have no
     # column 0 to select when the query returns no rows.
     df['genre'] = df['genres'].str.split(',').str[0]
     # Calculate score based on numVotes * averageRating
     df['score'] = df['numVotes'] * df['averageRating']

     # Rows without a genre or score can never be the best movie; dropping
     # them keeps idxmax from being handed missing values.
     df = df.dropna(subset=['genre', 'score'])
     if df.empty:
         return pd.DataFrame(columns=columns)

     # Get the best movie (highest score) for each genre
     idx = df.groupby('genre')['score'].idxmax()
     best_movies_by_genre = (
         df.loc[idx, columns]
         .sort_values(by='score', ascending=False)
         .reset_index(drop=True)
     )
     return best_movies_by_genre
+# Main function to orchestrate the dashboard
 def main():
     """Build the IMDb dashboard page.

     Opens the SQLite database, then renders, in order: the page heading,
     three summary figures (total movies, total years, average rating), the
     best-movie-per-genre table, the global film map and the genre word
     cloud. The database connection is closed when rendering finishes,
     including when one of the steps raises.
     """
     # Load data from SQLite database
     db_file = 'imdb_data/imdb_data.db'  # Adjust path as needed
     conn = load_data(db_file)
     try:
         # Fetch and display summary info
         total_movies, total_years, avg_rating = fetch_summary_info(conn)

         # Page heading and section title
         st.markdown("<h1 style='text-align: center; font-size: 24px; border: 2px solid black; padding: 10px;'>IMDb Dashboard</h1>", unsafe_allow_html=True)
         st.markdown("<h2 style='text-align: center; font-size: 20px;'>Summary Information</h2>", unsafe_allow_html=True)

         # Layout the summary information in three columns with bold numbers
         col1, col2, col3 = st.columns(3)
         with col1:
             st.subheader("Total Movies")
             st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_movies}</p>", unsafe_allow_html=True)
         with col2:
             st.subheader("Total Years")
             st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_years}</p>", unsafe_allow_html=True)
         with col3:
             st.subheader("Average Rating")
             st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)

         # Reserve the table slot first, then the two chart columns, so the
         # table sits above the charts on the page.
         placeholder_table = st.empty()
         # Distinct names: these are a new pair of columns, not the summary ones.
         map_col, cloud_col = st.columns(2)

         # Best Movie Table
         with placeholder_table.container():
             st.subheader("Best Movie of Each Genre")
             best_movies_by_genre = find_best_movies_by_genre(conn)
             st.write(best_movies_by_genre)

         # Visualizations
         with map_col:
             st.subheader("Global Map of Total Films by Country")
             df_movie_region = pd.read_csv('movie_region.csv')  # Replace with your actual CSV loading
             fig = plot_global_map(df_movie_region)
             st.plotly_chart(fig, use_container_width=True)

         with cloud_col:
             st.subheader("Word Cloud of Top Genres")
             create_genre_wordcloud(conn)
     finally:
         # Close database connection even if a query, the CSV read or a
         # chart fails, so the connection is never left open.
         conn.close()
+# Execute the main function
 if __name__ == '__main__':
+    main()