Spaces:
Sleeping
Sleeping
import sqlite3 | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import streamlit as st | |
from wordcloud import WordCloud | |
import matplotlib.pyplot as plt | |
from collections import Counter | |
import numpy as np | |
import imageio | |
def load_data(db_file): | |
conn = sqlite3.connect(db_file) | |
return conn | |
genre_color_map = { | |
'Documentary': '#FFB3BA', # Light Pink | |
'Animation': '#BAFFC9', # Light Green | |
'Comedy': '#FFFFBA', # Light Yellow | |
'Short': '#BAE1FF', # Light Blue | |
'Romance': '#FFDFBA', # Light Peach | |
'News': '#E1BAFF', # Light Purple | |
'Drama': '#FFC6C6', # Light Red | |
'Fantasy': '#C6FFBA', # Light Lime | |
'Horror': '#D3D3D3', # Light Gray | |
'Biography': '#FFE4B5', # Moccasin | |
'Music': '#B0E0E6', # Powder Blue | |
'Crime': '#F0E68C', # Khaki | |
'Family': '#98FB98', # Pale Green | |
'Action': '#FFA07A', # Light Salmon | |
'History': '#DEB887', # Burlywood | |
'Adventure': '#87CEFA', # Light Sky Blue | |
'Mystery': '#DDA0DD', # Plum | |
'Musical': '#FFB6C1', # Light Pink | |
'War': '#B0C4DE', # Light Steel Blue | |
'Sci-Fi': '#90EE90', # Light Green | |
'Western': '#F4A460', # Sandy Brown | |
'Thriller': '#FA8072', # Salmon | |
'Sport': '#20B2AA', # Light Sea Green | |
'Film-Noir': '#778899', # Light Slate Gray | |
'Talk-Show': '#FAFAD2', # Light Goldenrod Yellow | |
'Game-Show': '#FFC0CB', # Pink | |
'Adult': '#DB7093', # Pale Violet Red | |
'Reality-TV': '#F08080' # Light Coral | |
} | |
def fetch_genre_movie_releases(conn): | |
query = r''' | |
SELECT startYear, genres | |
FROM title_basics | |
WHERE titleType = 'movie' AND startYear != '\N' AND genres != '\N' | |
''' | |
df = pd.read_sql_query(query, conn) | |
df['genres'] = df['genres'].str.split(',') | |
df = df.explode('genres') | |
df['startYear'] = pd.to_numeric(df['startYear']) | |
genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count') | |
return genre_counts | |
def fetch_movie_release_years(conn): | |
query_release_years = r''' | |
SELECT startYear, COUNT(*) as count | |
FROM title_basics | |
WHERE titleType = 'movie' AND startYear != '\N' | |
GROUP BY startYear | |
ORDER BY startYear | |
''' | |
df_release_years = pd.read_sql_query(query_release_years, conn) | |
return df_release_years | |
def fetch_and_plot_average_rating_by_genre(conn): | |
query = r''' | |
SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres | |
FROM title_basics tb | |
JOIN title_ratings tr ON tb.tconst = tr.tconst | |
WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\N' | |
''' | |
df = pd.read_sql_query(query, conn) | |
def extract_first_genre(genres): | |
if genres: | |
return genres.split(',')[0].strip() | |
else: | |
return None | |
df['first_genre'] = df['genres'].apply(extract_first_genre) | |
df = df.dropna(subset=['first_genre']) | |
fig = px.box(df, x='first_genre', y='averageRating', | |
labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'}, | |
title='Average Rating of Movies by First Genre', | |
color='first_genre', | |
color_discrete_map=genre_color_map) | |
def genre_color_func(word, font_size, position, orientation, random_state=None, **kwargs): | |
return genre_color_map.get(word, '#FFFFFF') | |
def create_genre_wordcloud(conn): | |
query = r''' | |
SELECT genres | |
FROM title_basics | |
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\N' | |
''' | |
df = pd.read_sql_query(query, conn) | |
genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True) | |
genre_counts = Counter(genres) | |
wordcloud = WordCloud(width=800, height=800, background_color='white', color_func=genre_color_func).generate_from_frequencies(genre_counts) | |
plt.figure(figsize=(10, 10)) | |
plt.imshow(wordcloud, interpolation='bilinear') | |
plt.axis('off') | |
st.pyplot(plt.gcf()) | |
def find_best_movies_by_genre(conn): | |
query = r''' | |
SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes | |
FROM title_basics tb | |
JOIN title_ratings tr ON tb.tconst = tr.tconst | |
WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\N' | |
''' | |
df = pd.read_sql_query(query, conn) | |
df['genre'] = df['genres'].str.split(',', expand=True)[0] | |
df['score'] = df['numVotes'] * df['averageRating'] | |
idx = df.groupby('genre')['score'].idxmax() | |
best_movies_by_genre = df.loc[idx, ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']] \ | |
.sort_values(by='score', ascending=False).reset_index(drop=True) | |
return best_movies_by_genre | |
def plot_stacked_genre_movie_releases(genre_counts): | |
fig = px.area(genre_counts, x='startYear', y='count', color='genres', | |
title='Stacked Genre Movie Releases by Year', | |
labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'}, | |
line_group='genres', # This groups lines by genre | |
hover_name='genres', # This sets the genre as the hover label | |
hover_data={'count': ':.0f'}, # Format hover data as integer | |
color_discrete_map=genre_color_map) # Apply color map | |
return fig | |
def plot_global_map(conn): | |
movie_region_df = pd.read_csv('movie_region.csv') | |
# SQL query to get unique first genre of each title | |
query_genre = ''' | |
SELECT tconst AS titleId, primaryTitle, | |
CASE | |
WHEN instr(genres, ',') > 0 THEN substr(genres, 1, instr(genres, ',') - 1) | |
ELSE genres | |
END AS first_genre | |
FROM title_basics; | |
''' | |
genre_data_df = pd.read_sql_query(query_genre, conn) | |
merged_df = pd.merge(movie_region_df, genre_data_df, on='titleId', how='inner') | |
df = merged_df.replace('\\N', np.nan).dropna(subset=['first_genre']) | |
grouped = df.groupby('region')['first_genre'].agg(lambda x: ', '.join(x)).reset_index() | |
grouped['genres_list'] = grouped['first_genre'].apply(lambda x: x.split(', ')) | |
grouped['most_common_genre'] = grouped['genres_list'].apply(lambda x: pd.Series(x).value_counts().index[0] if len(x) > 0 else '') | |
result = grouped[['region', 'most_common_genre']].copy() | |
country_mapping = { | |
'AF': 'Afghanistan', 'AX': 'Åland Islands', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa', | |
'AD': 'Andorra', 'AO': 'Angola', 'AI': 'Anguilla', 'AQ': 'Antarctica', 'AG': 'Antigua and Barbuda', | |
'AR': 'Argentina', 'AM': 'Armenia', 'AW': 'Aruba', 'AU': 'Australia', 'AT': 'Austria', | |
'AZ': 'Azerbaijan', 'BS': 'Bahamas', 'BH': 'Bahrain', 'BD': 'Bangladesh', 'BB': 'Barbados', | |
'BY': 'Belarus', 'BE': 'Belgium', 'BZ': 'Belize', 'BJ': 'Benin', 'BM': 'Bermuda', | |
'BT': 'Bhutan', 'BO': 'Bolivia', 'BA': 'Bosnia and Herzegovina', 'BW': 'Botswana', 'BR': 'Brazil', | |
'BN': 'Brunei Darussalam', 'BG': 'Bulgaria', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'KH': 'Cambodia', | |
'CM': 'Cameroon', 'CA': 'Canada', 'CV': 'Cape Verde', 'KY': 'Cayman Islands', 'CF': 'Central African Republic', | |
'TD': 'Chad', 'CL': 'Chile', 'CN': 'China', 'CO': 'Colombia', 'KM': 'Comoros', | |
'CG': 'Congo', 'CD': 'Congo, Democratic Republic of the', 'CK': 'Cook Islands', 'CR': 'Costa Rica', 'HR': 'Croatia', | |
'CU': 'Cuba', 'CY': 'Cyprus', 'CZ': 'Czech Republic', 'DK': 'Denmark', 'DJ': 'Djibouti', | |
'DM': 'Dominica', 'DO': 'Dominican Republic', 'EC': 'Ecuador', 'EG': 'Egypt', 'SV': 'El Salvador', | |
'GQ': 'Equatorial Guinea', 'ER': 'Eritrea', 'EE': 'Estonia', 'ET': 'Ethiopia', 'FJ': 'Fiji', | |
'FI': 'Finland', 'FR': 'France', 'GA': 'Gabon', 'GM': 'Gambia', 'GE': 'Georgia', | |
'DE': 'Germany', 'GH': 'Ghana', 'GR': 'Greece', 'GL': 'Greenland', 'GD': 'Grenada', | |
'GU': 'Guam', 'GT': 'Guatemala', 'GN': 'Guinea', 'GW': 'Guinea-Bissau', 'GY': 'Guyana', | |
'HT': 'Haiti', 'VA': 'Holy See (Vatican City State)', 'HN': 'Honduras', 'HK': 'Hong Kong', 'HU': 'Hungary', | |
'IS': 'Iceland', 'IN': 'India', 'ID': 'Indonesia', 'IR': 'Iran', 'IQ': 'Iraq', | |
'IE': 'Ireland', 'IL': 'Israel', 'IT': 'Italy', 'CI': "Cote d'Ivoire", 'JM': 'Jamaica', | |
'JP': 'Japan', 'JO': 'Jordan', 'KZ': 'Kazakhstan', 'KE': 'Kenya', 'KI': 'Kiribati', | |
'KP': "Korea, Democratic People's Republic of", 'KR': 'Korea, Republic of', 'KW': 'Kuwait', 'KG': 'Kyrgyzstan', 'LA': "Lao People's Democratic Republic", | |
'LV': 'Latvia', 'LB': 'Lebanon', 'LS': 'Lesotho', 'LR': 'Liberia', 'LY': 'Libyan Arab Jamahiriya', | |
'LI': 'Liechtenstein', 'LT': 'Lithuania', 'LU': 'Luxembourg', 'MO': 'Macao', 'MK': 'Macedonia, The Former Yugoslav Republic of', | |
'MG': 'Madagascar', 'MW': 'Malawi', 'MY': 'Malaysia', 'MV': 'Maldives', 'ML': 'Mali', | |
'MT': 'Malta', 'MH': 'Marshall Islands', 'MR': 'Mauritania', 'MU': 'Mauritius', 'YT': 'Mayotte', | |
'MX': 'Mexico', 'FM': 'Micronesia', 'MD': 'Moldova, Republic of', 'MC': 'Monaco', 'MN': 'Mongolia', | |
'ME': 'Montenegro', 'MS': 'Montserrat', 'MA': 'Morocco', 'MZ': 'Mozambique', 'MM': 'Myanmar', | |
'NA': 'Namibia', 'NR': 'Nauru', 'NP': 'Nepal', 'NL': 'Netherlands', 'AN': 'Netherlands Antilles', | |
'NC': 'New Caledonia', 'NZ': 'New Zealand', 'NI': 'Nicaragua', 'NE': 'Niger', 'NG': 'Nigeria', | |
'NU': 'Niue', 'NF': 'Norfolk Island', 'MP': 'Northern Mariana Islands', 'NO': 'Norway', 'OM': 'Oman', | |
'PK': 'Pakistan', 'PW': 'Palau', 'PS': 'Palestinian Territory, Occupied', 'PA': 'Panama', 'PG': 'Papua New Guinea', | |
'PY': 'Paraguay', 'PE': 'Peru', 'PH': 'Philippines', 'PN': 'Pitcairn', 'PL': 'Poland', | |
'PT': 'Portugal', 'PR': 'Puerto Rico', 'QA': 'Qatar', 'RO': 'Romania', 'RU': 'Russian Federation', | |
'RW': 'Rwanda', 'RE': 'Reunion', 'BL': 'Saint Barthelemy', 'SH': 'Saint Helena', 'KN': 'Saint Kitts and Nevis', | |
'LC': 'Saint Lucia', 'MF': 'Saint Martin', 'PM': 'Saint Pierre and Miquelon', 'VC': 'Saint Vincent and the Grenadines', 'WS': 'Samoa', | |
'SM': 'San Marino', 'ST': 'Sao Tome and Principe', 'SA': 'Saudi Arabia', 'SN': 'Senegal', 'RS': 'Serbia', | |
'SC': 'Seychelles', 'SL': 'Sierra Leone', 'SG': 'Singapore', 'SK': 'Slovakia', 'SI': 'Slovenia', | |
'SB': 'Solomon Islands', 'SO': 'Somalia', 'ZA': 'South Africa', 'GS': 'South Georgia and the South Sandwich Islands', 'ES': 'Spain', | |
'LK': 'Sri Lanka', 'SD': 'Sudan', 'SR': 'Suriname', 'SJ': 'Svalbard and Jan Mayen', 'SZ': 'Swaziland', | |
'SE': 'Sweden', 'CH': 'Switzerland', 'SY': 'Syrian Arab Republic', 'TW': 'Taiwan', 'TJ': 'Tajikistan', | |
'TZ': 'Tanzania, United Republic of', 'TH': 'Thailand', 'TL': 'Timor-Leste', 'TG': 'Togo', 'TK': 'Tokelau', | |
'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia', 'TR': 'Turkey', 'TM': 'Turkmenistan', | |
'TV': 'Tuvalu', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates', 'GB': 'United Kingdom', | |
'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu', 'VE': 'Venezuela', | |
'VN': 'Viet Nam', 'VG': 'Virgin Islands, British', 'VI': 'Virgin Islands, U.S.', 'WF': 'Wallis and Futuna', 'EH': 'Western Sahara', | |
'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe' | |
} | |
result.loc[:, 'region'] = result['region'].map(country_mapping) | |
genre_color_map = { | |
'Documentary': '#FFB3BA', # Light Pink | |
'Animation': '#BAFFC9', # Light Green | |
'Comedy': '#FFFFBA', # Light Yellow | |
'Short': '#BAE1FF', # Light Blue | |
'Romance': '#FFDFBA', # Light Peach | |
'News': '#E1BAFF', # Light Purple | |
'Drama': '#FFC6C6', # Light Red | |
'Fantasy': '#C6FFBA', # Light Lime | |
'Horror': '#D3D3D3', # Light Gray | |
'Biography': '#FFE4B5', # Moccasin | |
'Music': '#B0E0E6', # Powder Blue | |
'Crime': '#F0E68C', # Khaki | |
'Family': '#98FB98', # Pale Green | |
'Action': '#FFA07A', # Light Salmon | |
'History': '#DEB887', # Burlywood | |
'Adventure': '#87CEFA', # Light Sky Blue | |
'Mystery': '#DDA0DD', # Plum | |
'Musical': '#FFB6C1', # Light Pink | |
'War': '#B0C4DE', # Light Steel Blue | |
'Sci-Fi': '#90EE90', # Light Green | |
'Western': '#F4A460', # Sandy Brown | |
'Thriller': '#FA8072', # Salmon | |
'Sport': '#20B2AA', # Light Sea Green | |
'Film-Noir': '#778899', # Light Slate Gray | |
'Talk-Show': '#FAFAD2', # Light Goldenrod Yellow | |
'Game-Show': '#FFC0CB', # Pink | |
'Adult': '#DB7093', # Pale Violet Red | |
'Reality-TV': '#F08080' # Light Coral | |
} | |
fig = px.choropleth( | |
result, | |
locations='region', | |
locationmode='country names', | |
color='most_common_genre', | |
hover_name='region', | |
hover_data={'region': False, 'most_common_genre': True}, | |
title='Most Common Movie Genre by Country', | |
color_discrete_map=genre_color_map, | |
) | |
# Update the layout | |
fig.update_layout( | |
geo=dict(showframe=False, showcoastlines=True, projection_type='natural earth') | |
) | |
return fig | |
# Function to fetch summary info | |
def fetch_summary_info(conn): | |
# Fetch total count of movies | |
query_total_movies = r''' | |
SELECT COUNT(*) as total_movies | |
FROM title_basics | |
WHERE titleType = 'movie' | |
''' | |
total_movies = pd.read_sql_query(query_total_movies, conn).iloc[0]['total_movies'] | |
# Fetch total count of years | |
query_total_years = r''' | |
SELECT COUNT(DISTINCT startYear) as total_years | |
FROM title_basics | |
WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\N' | |
''' | |
total_years = pd.read_sql_query(query_total_years, conn).iloc[0]['total_years'] | |
# Fetch average rating of movies | |
query_avg_rating = r''' | |
SELECT AVG(averageRating) as avg_rating | |
FROM title_ratings | |
''' | |
avg_rating = pd.read_sql_query(query_avg_rating, conn).iloc[0]['avg_rating'] | |
return total_movies, total_years, avg_rating | |
# Main Streamlit app | |
def run_app(): | |
st.title('IMDb Movie Data Analysis') | |
# Load data from SQLite database | |
conn = load_data('imdb_data.db') | |
genre_counts = fetch_genre_movie_releases(conn) | |
total_movies, total_years, avg_rating = fetch_summary_info(conn) | |
# Layout for summary info in three columns | |
col1, col2, col3 = st.columns(3) | |
with col1: | |
st.subheader('Total Movies') | |
st.metric(label='zzz', value=total_movies) | |
with col2: | |
st.subheader('Total Years of Movie Data') | |
st.metric(label='zzz', value=total_years) | |
with col3: | |
st.subheader('Average Movie Rating') | |
st.metric(label='zzz', value=f'{avg_rating:.2f}') | |
# Find and display best movies by genre | |
best_movies_by_genre = find_best_movies_by_genre(conn) | |
fig_global_map = plot_global_map(conn) | |
fig_genre_movie_releases = plot_stacked_genre_movie_releases(genre_counts) | |
# Layout for best movies by genre in two columns | |
col1, col2 = st.columns(2) | |
with col1: | |
st.subheader('Genre Movie Releases by Year') | |
st.plotly_chart(fig_genre_movie_releases, use_container_width=True) | |
with col2: | |
st.subheader('Global Map of Films') | |
st.plotly_chart(fig_global_map, use_container_width=True) | |
fig_avg_rating_by_genre = fetch_and_plot_average_rating_by_genre(conn) | |
# Layout for Plotly charts in three columns | |
col1, col2, col3 = st.columns(3) | |
with col1: | |
st.subheader('Best Movies by Genre') | |
st.dataframe(best_movies_by_genre) | |
with col2: | |
create_genre_wordcloud(conn) | |
with col3: | |
st.subheader('Average Rating by Genre') | |
st.plotly_chart(fig_avg_rating_by_genre, use_container_width=True) | |
# Close database connection | |
conn.close() | |
if __name__ == '__main__': | |
run_app() |