Spaces:
Sleeping
Sleeping
| import sqlite3 | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import streamlit as st | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| from collections import Counter | |
| import numpy as np | |
| # Function to load data from SQLite database | |
| def load_data(db_file): | |
| conn = sqlite3.connect(db_file) | |
| return conn | |
| # Function to fetch genre movie releases by year | |
| def fetch_genre_movie_releases(conn): | |
| query = r''' | |
| SELECT startYear, genres | |
| FROM title_basics | |
| WHERE titleType = 'movie' AND startYear != '\N' AND genres != '\N' | |
| ''' | |
| df = pd.read_sql_query(query, conn) | |
| # Split genres and explode to separate rows | |
| df['genres'] = df['genres'].str.split(',') | |
| df = df.explode('genres') | |
| # Convert startYear to numeric | |
| df['startYear'] = pd.to_numeric(df['startYear']) | |
| # Group by startYear and genre, count the number of movies | |
| genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count') | |
| return genre_counts | |
| # Function to fetch data for filled line chart of movie release years | |
| def fetch_movie_release_years(conn): | |
| query_release_years = r''' | |
| SELECT startYear, COUNT(*) as count | |
| FROM title_basics | |
| WHERE titleType = 'movie' AND startYear != '\N' | |
| GROUP BY startYear | |
| ORDER BY startYear | |
| ''' | |
| df_release_years = pd.read_sql_query(query_release_years, conn) | |
| return df_release_years | |
| # Function to fetch data and create box plot of average rating by first_genre | |
| def fetch_and_plot_average_rating_by_genre(conn): | |
| query = r''' | |
| SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres | |
| FROM title_basics tb | |
| JOIN title_ratings tr ON tb.tconst = tr.tconst | |
| WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\N' | |
| ''' | |
| df = pd.read_sql_query(query, conn) | |
| # Function to extract the first genre from the genres list | |
| def extract_first_genre(genres): | |
| if genres: | |
| return genres.split(',')[0].strip() | |
| else: | |
| return None | |
| # Apply the function to extract the first genre | |
| df['first_genre'] = df['genres'].apply(extract_first_genre) | |
| # Drop rows where first_genre is None (shouldn't be necessary if genres column is clean) | |
| df = df.dropna(subset=['first_genre']) | |
| # Create a box plot of average rating by first_genre | |
| fig = px.box(df, x='first_genre', y='averageRating', | |
| labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'}, | |
| title='Average Rating of Movies by First Genre') | |
| return fig | |
| # Function to create word cloud of genres | |
| def create_genre_wordcloud(conn): | |
| query = r''' | |
| SELECT genres | |
| FROM title_basics | |
| WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\N' | |
| ''' | |
| df = pd.read_sql_query(query, conn) | |
| # Process genres | |
| genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True) | |
| genre_counts = Counter(genres) | |
| # Generate the word cloud | |
| wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts) | |
| # Display the word cloud | |
| plt.figure(figsize=(10, 5)) | |
| plt.imshow(wordcloud, interpolation='bilinear') | |
| plt.axis('off') | |
| plt.title('Top Genres in IMDb Dataset') | |
| st.pyplot(plt.gcf()) # Pass the current figure explicitly to st.pyplot() | |
| # Function to find best movie of each genre by numVotes * averageRating | |
| def find_best_movies_by_genre(conn): | |
| query = r''' | |
| SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes | |
| FROM title_basics tb | |
| JOIN title_ratings tr ON tb.tconst = tr.tconst | |
| WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\N' | |
| ''' | |
| df = pd.read_sql_query(query, conn) | |
| # Split genres and select the first genre for each movie | |
| df['genre'] = df['genres'].str.split(',', expand=True)[0] | |
| # Calculate score based on numVotes * averageRating | |
| df['score'] = df['numVotes'] * df['averageRating'] | |
| # Get the best movie (highest score) for each genre | |
| idx = df.groupby('genre')['score'].idxmax() | |
| best_movies_by_genre = df.loc[idx, ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']] \ | |
| .sort_values(by='score', ascending=False).reset_index(drop=True) | |
| return best_movies_by_genre | |
| # Function to plot stacked area chart of genre movie releases by year using Plotly Express | |
| def plot_stacked_genre_movie_releases(genre_counts): | |
| fig = px.area(genre_counts, x='startYear', y='count', color='genres', | |
| title='Stacked Genre Movie Releases by Year', | |
| labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'}, | |
| line_group='genres', # This groups lines by genre | |
| hover_name='genres', # This sets the genre as the hover label | |
| hover_data={'count': ':.0f'}) # Format hover data as integer | |
| fig.update_layout(xaxis_tickmode='linear', # Ensure x-axis ticks are shown in a linear manner | |
| xaxis_range=[2000, 2025]) # Adjust x-axis range if needed | |
| return fig | |
| # Function to plot filled line chart of movie release years using Plotly Graph Objects | |
| def plot_movie_release_years(df_release_years): | |
| fig = go.Figure(data=go.Scatter(x=df_release_years['startYear'], y=df_release_years['count'], fill='tozeroy')) | |
| fig.update_layout(title='Movie Release Years', | |
| xaxis_title='Year', | |
| yaxis_title='Number of Movies Released') | |
| return fig | |
| # Function to plot global map of total films per region using Plotly Express | |
| def plot_global_map(): | |
| df = pd.read_csv('movie_region.csv') | |
| # Country code to name mapping | |
| country_mapping = { | |
| 'AF': 'Afghanistan', 'AX': 'Åland Islands', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa', | |
| 'AD': 'Andorra', 'AO': 'Angola', 'AI': 'Anguilla', 'AQ': 'Antarctica', 'AG': 'Antigua and Barbuda', | |
| 'AR': 'Argentina', 'AM': 'Armenia', 'AW': 'Aruba', 'AU': 'Australia', 'AT': 'Austria', | |
| 'AZ': 'Azerbaijan', 'BS': 'Bahamas', 'BH': 'Bahrain', 'BD': 'Bangladesh', 'BB': 'Barbados', | |
| 'BY': 'Belarus', 'BE': 'Belgium', 'BZ': 'Belize', 'BJ': 'Benin', 'BM': 'Bermuda', | |
| 'BT': 'Bhutan', 'BO': 'Bolivia', 'BA': 'Bosnia and Herzegovina', 'BW': 'Botswana', 'BR': 'Brazil', | |
| 'BN': 'Brunei Darussalam', 'BG': 'Bulgaria', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'KH': 'Cambodia', | |
| 'CM': 'Cameroon', 'CA': 'Canada', 'CV': 'Cape Verde', 'KY': 'Cayman Islands', 'CF': 'Central African Republic', | |
| 'TD': 'Chad', 'CL': 'Chile', 'CN': 'China', 'CO': 'Colombia', 'KM': 'Comoros', | |
| 'CG': 'Congo', 'CD': 'Congo, Democratic Republic of the', 'CR': 'Costa Rica', 'HR': 'Croatia', 'CU': 'Cuba', | |
| 'CY': 'Cyprus', 'CZ': 'Czech Republic', 'DK': 'Denmark', 'DJ': 'Djibouti', 'DM': 'Dominica', | |
| 'DO': 'Dominican Republic', 'EC': 'Ecuador', 'EG': 'Egypt', 'SV': 'El Salvador', 'GQ': 'Equatorial Guinea', | |
| 'ER': 'Eritrea', 'EE': 'Estonia', 'ET': 'Ethiopia', 'FJ': 'Fiji', 'FI': 'Finland', | |
| 'FR': 'France', 'GA': 'Gabon', 'GM': 'Gambia', 'GE': 'Georgia', 'DE': 'Germany', | |
| 'GH': 'Ghana', 'GR': 'Greece', 'GD': 'Grenada', 'GT': 'Guatemala', 'GN': 'Guinea', | |
| 'GW': 'Guinea-Bissau', 'GY': 'Guyana', 'HT': 'Haiti', 'HN': 'Honduras', 'HK': 'Hong Kong', | |
| 'HU': 'Hungary', 'IS': 'Iceland', 'IN': 'India', 'ID': 'Indonesia', 'IR': 'Iran, Islamic Republic of', | |
| 'IQ': 'Iraq', 'IE': 'Ireland', 'IL': 'Israel', 'IT': 'Italy', 'JM': 'Jamaica', | |
| 'JP': 'Japan', 'JO': 'Jordan', 'KZ': 'Kazakhstan', 'KE': 'Kenya', 'KP': 'Korea, Democratic People\'s Republic of', | |
| 'KR': 'Korea, Republic of', 'KW': 'Kuwait', 'KG': 'Kyrgyzstan', 'LA': 'Lao People\'s Democratic Republic', | |
| 'LV': 'Latvia', 'LB': 'Lebanon', 'LS': 'Lesotho', 'LR': 'Liberia', 'LY': 'Libya', | |
| 'LT': 'Lithuania', 'LU': 'Luxembourg', 'MO': 'Macao', 'MK': 'Macedonia, the Former Yugoslav Republic of', | |
| 'MG': 'Madagascar', 'MW': 'Malawi', 'MY': 'Malaysia', 'MV': 'Maldives', 'ML': 'Mali', | |
| 'MT': 'Malta', 'MR': 'Mauritania', 'MU': 'Mauritius', 'MX': 'Mexico', 'MD': 'Moldova, Republic of', | |
| 'MN': 'Mongolia', 'ME': 'Montenegro', 'MA': 'Morocco', 'MZ': 'Mozambique', 'MM': 'Myanmar', | |
| 'NA': 'Namibia', 'NP': 'Nepal', 'NL': 'Netherlands', 'NZ': 'New Zealand', 'NI': 'Nicaragua', | |
| 'NE': 'Niger', 'NG': 'Nigeria', 'NO': 'Norway', 'OM': 'Oman', 'PK': 'Pakistan', | |
| 'PW': 'Palau', 'PA': 'Panama', 'PG': 'Papua New Guinea', 'PY': 'Paraguay', 'PE': 'Peru', | |
| 'PH': 'Philippines', 'PL': 'Poland', 'PT': 'Portugal', 'QA': 'Qatar', 'RO': 'Romania', | |
| 'RU': 'Russian Federation', 'RW': 'Rwanda', 'WS': 'Samoa', 'SA': 'Saudi Arabia', 'SN': 'Senegal', | |
| 'RS': 'Serbia', 'SL': 'Sierra Leone', 'SG': 'Singapore', 'SK': 'Slovakia', 'SI': 'Slovenia', | |
| 'SB': 'Solomon Islands', 'ZA': 'South Africa', 'ES': 'Spain', 'LK': 'Sri Lanka', 'SD': 'Sudan', | |
| 'SR': 'Suriname', 'SZ': 'Swaziland', 'SE': 'Sweden', 'CH': 'Switzerland', 'SY': 'Syrian Arab Republic', | |
| 'TW': 'Taiwan, Province of China', 'TJ': 'Tajikistan', 'TZ': 'Tanzania, United Republic of', 'TH': 'Thailand', | |
| 'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia', | |
| 'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates', | |
| 'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu', | |
| 'VE': 'Venezuela, Bolivarian Republic of', 'VN': 'Viet Nam', 'ZM': 'Zambia', 'ZW': 'Zimbabwe' | |
| } | |
| # Map country codes to country names | |
| df['region'] = df['region'].map(country_mapping) | |
| # Group by country and count the number of films | |
| df_grouped = df.groupby('region').size().reset_index(name='total_films') | |
| # Apply log transformation to handle outliers | |
| df_grouped['log_total_films'] = np.log1p(df_grouped['total_films']) | |
| # Create a choropleth map with the log-transformed data | |
| fig = px.choropleth(df_grouped, locations='region', locationmode='country names', | |
| color='log_total_films', hover_name='region', | |
| color_continuous_scale='Plasma', # Change the color scheme here | |
| labels={'log_total_films': 'Total Films (log scale)'}) | |
| # Update layout of the map | |
| fig.update_layout(title='Total Films by Country (Log Scale)', | |
| geo=dict(showframe=False, showcoastlines=False, | |
| projection_type='equirectangular')) | |
| return fig | |
| # Function to fetch summary info | |
| def fetch_summary_info(conn): | |
| # Fetch total count of movies | |
| query_total_movies = r''' | |
| SELECT COUNT(*) as total_movies | |
| FROM title_basics | |
| WHERE titleType = 'movie' | |
| ''' | |
| total_movies = pd.read_sql_query(query_total_movies, conn).iloc[0]['total_movies'] | |
| # Fetch total count of years | |
| query_total_years = r''' | |
| SELECT COUNT(DISTINCT startYear) as total_years | |
| FROM title_basics | |
| WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\N' | |
| ''' | |
| total_years = pd.read_sql_query(query_total_years, conn).iloc[0]['total_years'] | |
| # Fetch average rating of movies | |
| query_avg_rating = r''' | |
| SELECT AVG(averageRating) as avg_rating | |
| FROM title_ratings | |
| ''' | |
| avg_rating = pd.read_sql_query(query_avg_rating, conn).iloc[0]['avg_rating'] | |
| return total_movies, total_years, avg_rating | |
| # Main Streamlit app | |
| def run_app(): | |
| st.title('IMDb Movie Data Analysis') | |
| # Load data from SQLite database | |
| conn = load_data('imdb_data.db') | |
| fig_global_map = plot_global_map() | |
| st.plotly_chart(fig_global_map) | |
| # Fetch genre movie releases data | |
| genre_counts = fetch_genre_movie_releases(conn) | |
| # Plot genre movie releases by year (stacked area chart) | |
| fig_genre_movie_releases = plot_stacked_genre_movie_releases(genre_counts) | |
| st.plotly_chart(fig_genre_movie_releases) | |
| # Fetch movie release years data | |
| df_release_years = fetch_movie_release_years(conn) | |
| # Plot movie release years (filled line chart) | |
| fig_movie_release_years = plot_movie_release_years(df_release_years) | |
| st.plotly_chart(fig_movie_release_years) | |
| # Fetch and plot average rating by genre (box plot) | |
| fig_avg_rating_by_genre = fetch_and_plot_average_rating_by_genre(conn) | |
| st.plotly_chart(fig_avg_rating_by_genre) | |
| # Create and display word cloud of genres | |
| create_genre_wordcloud(conn) | |
| # Find and display best movies by genre | |
| best_movies_by_genre = find_best_movies_by_genre(conn) | |
| st.subheader('Best Movies by Genre') | |
| st.dataframe(best_movies_by_genre) | |
| # Fetch summary info | |
| total_movies, total_years, avg_rating = fetch_summary_info(conn) | |
| # Display summary info | |
| st.subheader('Summary Info') | |
| st.metric(label='Total Movies', value=total_movies) | |
| st.metric(label='Total Years of Movie Data', value=total_years) | |
| st.metric(label='Average Movie Rating', value=f'{avg_rating:.2f}') | |
| # Close database connection | |
| conn.close() | |
| if __name__ == '__main__': | |
| run_app() |