Spaces:

Penguni
/

dashboardmovie

Sleeping

App Files Files Community

Penguni committed on Jun 25, 2024

Commit

ce46156

verified ·

1 Parent(s): c562e03

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -80

app.py CHANGED Viewed

@@ -1,21 +1,18 @@
 import sqlite3
 import pandas as pd
-import streamlit as st
 import plotly.express as px
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from collections import Counter
 import numpy as np
 # Function to load data from SQLite database
 def load_data(db_file):
     conn = sqlite3.connect(db_file)
     return conn
-# Function to fetch data from database based on query
-def fetch_data(conn, query):
-    return pd.read_sql_query(query, conn)
 # Function to fetch summary info from database
 def fetch_summary_info(conn):
     # Fetch total count of movies
@@ -43,6 +40,31 @@ def fetch_summary_info(conn):
     return total_movies, total_years, avg_rating
 # Function to plot global map of total films per region
 def plot_global_map(df):
     # Country code to name mapping
@@ -84,125 +106,128 @@ def plot_global_map(df):
         'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
         'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
         'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
-        'VE': 'Venezuela, Bolivarian Republic of', 'VN': 'Viet Nam', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
     }
-    # Map country codes to country names
     df['region'] = df['region'].map(country_mapping)
-    # Group by country and count the number of films
-    df_grouped = df.groupby('region').size().reset_index(name='total_films')
-    # Apply log transformation to handle outliers
-    df_grouped['log_total_films'] = np.log10(df_grouped['total_films'] + 1)
-    # Plotting the global map
-    fig = px.choropleth(df_grouped,
-                        locations='region',
-                        locationmode='country names',
-                        color='log_total_films',
-                        hover_name='region',
-                        color_continuous_scale=px.colors.sequential.Plasma,
-                        labels={'log_total_films': 'Log Total Films'},
                         title='Global Map of Total Films by Country')
     return fig
-# Function to create word cloud of genres
-def create_genre_wordcloud(conn):
-    query = '''
-        SELECT genres
-        FROM title_basics
-        WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
-    '''
-    df = fetch_data(conn, query)
-    # Process genres
-    genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
-    genre_counts = Counter(genres)
-    # Generate the word cloud
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
-    # Display the word cloud
     plt.figure(figsize=(10, 5))
     plt.imshow(wordcloud, interpolation='bilinear')
     plt.axis('off')
-    plt.title('Top Genres in IMDb Dataset')
-    st.pyplot(plt.gcf())  # Pass the current figure explicitly to st.pyplot()
-# Function to find best movie of each genre by numVotes * averageRating
 def find_best_movies_by_genre(conn):
     query = '''
-        SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
         FROM title_basics tb
         JOIN title_ratings tr ON tb.tconst = tr.tconst
         WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
     '''
     df = fetch_data(conn, query)
-    # Split genres and select the first genre for each movie
-    df['genre'] = df['genres'].str.split(',', expand=True)[0]
-    # Calculate score based on numVotes * averageRating
-    df['score'] = df['numVotes'] * df['averageRating']
-    # Get the best movie (highest score) for each genre
-    idx = df.groupby('genre')['score'].idxmax()
-    best_movies_by_genre = df.loc[idx, ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']] \
-        .sort_values(by='score', ascending=False).reset_index(drop=True)
     return best_movies_by_genre
-# Main function to orchestrate the dashboard
 def main():
     # Load data from SQLite database
-    db_file = 'imdb_data.db'  # Adjust path as needed
     conn = load_data(db_file)
-    # Fetch and display summary info
     total_movies, total_years, avg_rating = fetch_summary_info(conn)
-    # Display summary information in three columns with bold outline
-    st.markdown("<h1 style='text-align: center; font-size: 24px; border: 2px solid black; padding: 10px;'>IMDb Dashboard</h1>", unsafe_allow_html=True)
-    st.markdown("<h2 style='text-align: center; font-size: 20px;'>Summary Information</h2>", unsafe_allow_html=True)
-    # Layout the summary information in three columns with big bold numbers
     col1, col2, col3 = st.columns(3)
     with col1:
         st.subheader("Total Movies")
-        st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_movies}</p>", unsafe_allow_html=True)
     with col2:
         st.subheader("Total Years")
-        st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_years}</p>", unsafe_allow_html=True)
     with col3:
         st.subheader("Average Rating")
-        st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)
-    # Create placeholders for the table and visualizations
-    placeholder_table = st.empty()
-    col1, col2 = st.columns(2)
-    # Best Movie Table
-    with placeholder_table.container():
         st.subheader("Best Movie of Each Genre")
-        best_movies_by_genre = find_best_movies_by_genre(conn)
         st.write(best_movies_by_genre)
-    # Visualizations
-    with col1:
         st.subheader("Global Map of Total Films by Country")
-        df_movie_region = pd.read_csv('movie_region.csv')  # Replace with your actual CSV loading
-        fig = plot_global_map(df_movie_region)
-        st.plotly_chart(fig, use_container_width=True)
-    with col2:
         st.subheader("Word Cloud of Top Genres")
-        create_genre_wordcloud(conn)
     # Close database connection
     conn.close()
-# Execute the main function
 if __name__ == '__main__':
     main()

 import sqlite3
 import pandas as pd
 import plotly.express as px
+import plotly.graph_objects as go
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from collections import Counter
 import numpy as np
+import streamlit as st
 # Function to load data from SQLite database
 def load_data(db_file):
     conn = sqlite3.connect(db_file)
     return conn
 # Function to fetch summary info from database
 def fetch_summary_info(conn):
     # Fetch total count of movies
     return total_movies, total_years, avg_rating
+# Function to fetch data from database based on query
def fetch_data(conn, query, params=None):
    """Run *query* on *conn* and return the result set as a DataFrame.

    Args:
        conn: An open database connection (e.g. from ``load_data``).
        query: SQL text to execute.
        params: Optional values bound to the placeholders in *query*.
            Use this instead of formatting values into the SQL string.
            Defaults to ``None`` (no bound parameters), which matches the
            previous two-argument behaviour.

    Returns:
        A ``pandas.DataFrame`` with one column per selected field.
    """
    return pd.read_sql_query(query, conn, params=params)
+# Function to fetch genre movie releases by year
def fetch_genre_movie_releases(conn):
    """Count movie releases per (year, genre) pair.

    Reads ``startYear`` and ``genres`` for every movie in ``title_basics``
    whose values are not the ``\\N`` placeholder, splits the comma-separated
    genre list into one row per genre, and counts rows per year and genre.

    Rows whose ``startYear`` cannot be parsed as a number are dropped
    rather than aborting the whole query with a ``ValueError``.

    Args:
        conn: An open database connection exposing ``title_basics``.

    Returns:
        A DataFrame with columns ``startYear`` (int), ``genres`` and
        ``count``.
    """
    query = '''
        SELECT startYear, genres
        FROM title_basics
        WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
    '''
    df = pd.read_sql_query(query, conn)
    # One row per (movie, genre); a fresh index avoids duplicate labels.
    df['genres'] = df['genres'].str.split(',')
    df = df.explode('genres').reset_index(drop=True)
    # Malformed years become NaN and are discarded instead of raising.
    df['startYear'] = pd.to_numeric(df['startYear'], errors='coerce')
    df = df.dropna(subset=['startYear']).astype({'startYear': int})
    # Group by startYear and genre, count the number of movies
    genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
    return genre_counts
 # Function to plot global map of total films per region
 def plot_global_map(df):
     # Country code to name mapping
         'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
         'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
         'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
+        'VE': 'Venezuela', 'VN': 'Viet Nam', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
     }
+    # Mapping country codes to names
     df['region'] = df['region'].map(country_mapping)
+    # Count total films per country
+    country_counts = df['region'].value_counts().reset_index(name='total_films')
+    # Plotting with Plotly Express
+    fig = px.choropleth(country_counts, locations='index', locationmode='country names', color='total_films',
+                        hover_name='index', color_continuous_scale='Viridis',
                         title='Global Map of Total Films by Country')
     return fig
+# Function to plot word cloud of top genres
def plot_word_cloud(genres_list):
    """Render a word cloud of genre frequencies and return its figure.

    Args:
        genres_list: Iterable of genre names, one entry per occurrence;
            each genre's frequency sets the size of its word.

    Returns:
        The ``matplotlib`` Figure holding the rendered word cloud. The
        figure object is returned (rather than the ``pyplot`` module) so
        the caller renders exactly this plot instead of whatever happens
        to be pyplot's current global figure. It can be passed straight to
        ``st.pyplot``.
    """
    genre_counts = Counter(genres_list)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
    # Draw on an explicit figure/axes pair instead of pyplot's implicit state.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('Top Genres Word Cloud')
    return fig
+# Function to find best movies by genre
 def find_best_movies_by_genre(conn):
     query = '''
+        SELECT tb.genres, tb.primaryTitle, tr.averageRating
         FROM title_basics tb
         JOIN title_ratings tr ON tb.tconst = tr.tconst
         WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
+        ORDER BY tr.averageRating DESC
     '''
     df = fetch_data(conn, query)
+    # Split genres and keep the top-rated movie for each genre
+    genre_movie_mapping = {}
+    for _, row in df.iterrows():
+        genres = row['genres'].split(',')
+        for genre in genres:
+            if genre not in genre_movie_mapping:
+                genre_movie_mapping[genre] = (row['primaryTitle'], row['averageRating'])
+    # Create a DataFrame for display
+    best_movies_by_genre = pd.DataFrame([
+        {'Genre': genre, 'Movie': movie[0], 'Rating': movie[1]}
+        for genre, movie in genre_movie_mapping.items()
+    ])
     return best_movies_by_genre
+# Streamlit app
 def main():
+    st.title("IMDb Movie Data Analysis")
     # Load data from SQLite database
+    db_file = '/content/imdb_data/imdb_data.db'  # Adjust path as needed
     conn = load_data(db_file)
+    # Fetch summary info
     total_movies, total_years, avg_rating = fetch_summary_info(conn)
+    # Layout for summary info
     col1, col2, col3 = st.columns(3)
     with col1:
         st.subheader("Total Movies")
+        st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_movies}</h1>", unsafe_allow_html=True)
     with col2:
         st.subheader("Total Years")
+        st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_years}</h1>", unsafe_allow_html=True)
     with col3:
         st.subheader("Average Rating")
+        st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{avg_rating:.2f}</h1>", unsafe_allow_html=True)
+    # Fetch best movies by genre
+    best_movies_by_genre = find_best_movies_by_genre(conn)
+    # Layout for table, global map, and word cloud
+    st.markdown("---")
+    row1_col1, row1_col2, row1_col3 = st.columns([1, 1, 1])
+    with row1_col1:
         st.subheader("Best Movie of Each Genre")
         st.write(best_movies_by_genre)
+    with row1_col2:
         st.subheader("Global Map of Total Films by Country")
+        query_country_distribution = '''
+            SELECT region
+            FROM title_akas
+        '''
+        country_distribution = fetch_data(conn, query_country_distribution)
+        fig = plot_global_map(country_distribution)
+        st.plotly_chart(fig)
+    with row1_col3:
         st.subheader("Word Cloud of Top Genres")
+        query_genres = '''
+            SELECT genres
+            FROM title_basics
+            WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
+        '''
+        genres_list = fetch_data(conn, query_genres)['genres'].str.split(',').explode().tolist()
+        plt = plot_word_cloud(genres_list)
+        st.pyplot(plt)
+    # Fetch genre movie releases by year
+    genre_counts = fetch_genre_movie_releases(conn)
+    # Plot line chart using Plotly Express
+    st.markdown("---")
+    st.subheader("Genre Movie Releases by Year")
+    fig_genre_counts = px.line(genre_counts, x='startYear', y='count', color='genres',
+                               title='Genre Movie Releases by Year',
+                               labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
+    fig_genre_counts.update_layout(xaxis_tickmode='linear')  # Ensure x-axis ticks are shown in a linear manner
+    fig_genre_counts.update_xaxes(range=[2000, 2025])
+    st.plotly_chart(fig_genre_counts)
     # Close database connection
     conn.close()
# Build the dashboard only when this file is run as the entry script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()