Spaces:

Penguni
/

dashboardmovie

Sleeping

App Files Files Community

dashboardmovie / app.py

Penguni

Update app.py

a8f19d1 verified about 1 year ago

raw

history blame

14 kB

	import sqlite3
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import streamlit as st
	from wordcloud import WordCloud
	import matplotlib.pyplot as plt
	from collections import Counter
	import numpy as np
	import imageio

	# Function to load data from SQLite database
	def load_data(db_file):
	conn = sqlite3.connect(db_file)
	return conn

	# Function to fetch genre movie releases by year
	def fetch_genre_movie_releases(conn):
	query = r'''
	SELECT startYear, genres
	FROM title_basics
	WHERE titleType = 'movie' AND startYear != '\N' AND genres != '\N'
	'''
	df = pd.read_sql_query(query, conn)

	# Split genres and explode to separate rows
	df['genres'] = df['genres'].str.split(',')
	df = df.explode('genres')

	# Convert startYear to numeric
	df['startYear'] = pd.to_numeric(df['startYear'])

	# Group by startYear and genre, count the number of movies
	genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')

	return genre_counts

	# Function to fetch data for filled line chart of movie release years
	def fetch_movie_release_years(conn):
	query_release_years = r'''
	SELECT startYear, COUNT(*) as count
	FROM title_basics
	WHERE titleType = 'movie' AND startYear != '\N'
	GROUP BY startYear
	ORDER BY startYear
	'''
	df_release_years = pd.read_sql_query(query_release_years, conn)
	return df_release_years

	# Function to fetch data and create box plot of average rating by first_genre
	def fetch_and_plot_average_rating_by_genre(conn):
	query = r'''
	SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres
	FROM title_basics tb
	JOIN title_ratings tr ON tb.tconst = tr.tconst
	WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\N'
	'''
	df = pd.read_sql_query(query, conn)

	# Function to extract the first genre from the genres list
	def extract_first_genre(genres):
	if genres:
	return genres.split(',')[0].strip()
	else:
	return None

	# Apply the function to extract the first genre
	df['first_genre'] = df['genres'].apply(extract_first_genre)

	# Drop rows where first_genre is None (shouldn't be necessary if genres column is clean)
	df = df.dropna(subset=['first_genre'])

	# Create a box plot of average rating by first_genre
	fig = px.box(df, x='first_genre', y='averageRating',
	labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'},
	title='Average Rating of Movies by First Genre')

	return fig

	# Function to create word cloud of genres
	def create_genre_wordcloud(conn, mask_path):
	query = r'''
	SELECT genres
	FROM title_basics
	WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\N'
	'''
	df = pd.read_sql_query(query, conn)

	# Process genres
	genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
	genre_counts = Counter(genres)

	# Load the mask image
	mask = imageio.imread(mask_path)

	# Generate the word cloud with the mask
	wordcloud = WordCloud(width=800, height=400, background_color='white', mask=mask).generate_from_frequencies(genre_counts)

	# Display the word cloud
	plt.figure(figsize=(10, 5))
	plt.imshow(wordcloud, interpolation='bilinear')
	plt.axis('off')
	st.pyplot(plt.gcf())

	# Function to find best movie of each genre by numVotes * averageRating
	def find_best_movies_by_genre(conn):
	query = r'''
	SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
	FROM title_basics tb
	JOIN title_ratings tr ON tb.tconst = tr.tconst
	WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\N'
	'''
	df = pd.read_sql_query(query, conn)

	# Split genres and select the first genre for each movie
	df['genre'] = df['genres'].str.split(',', expand=True)[0]

	# Calculate score based on numVotes * averageRating
	df['score'] = df['numVotes'] * df['averageRating']

	# Get the best movie (highest score) for each genre
	idx = df.groupby('genre')['score'].idxmax()
	best_movies_by_genre = df.loc[idx, ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']] \
	.sort_values(by='score', ascending=False).reset_index(drop=True)

	return best_movies_by_genre

	# Function to plot stacked area chart of genre movie releases by year using Plotly Express
	def plot_stacked_genre_movie_releases(genre_counts):
	fig = px.area(genre_counts, x='startYear', y='count', color='genres',
	title='Stacked Genre Movie Releases by Year',
	labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'},
	line_group='genres', # This groups lines by genre
	hover_name='genres', # This sets the genre as the hover label
	hover_data={'count': ':.0f'}) # Format hover data as integer

	return fig

	# Function to plot global map of total films per region using Plotly Express

	def plot_global_map():
	# Read data from CSV file
	df = pd.read_csv('movie_region.csv')

	# Country code to name mapping
	country_mapping = {
	'AF': 'Afghanistan', 'AX': 'Åland Islands', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa',
	'AD': 'Andorra', 'AO': 'Angola', 'AI': 'Anguilla', 'AQ': 'Antarctica', 'AG': 'Antigua and Barbuda',
	'AR': 'Argentina', 'AM': 'Armenia', 'AW': 'Aruba', 'AU': 'Australia', 'AT': 'Austria',
	'AZ': 'Azerbaijan', 'BS': 'Bahamas', 'BH': 'Bahrain', 'BD': 'Bangladesh', 'BB': 'Barbados',
	'BY': 'Belarus', 'BE': 'Belgium', 'BZ': 'Belize', 'BJ': 'Benin', 'BM': 'Bermuda',
	'BT': 'Bhutan', 'BO': 'Bolivia', 'BA': 'Bosnia and Herzegovina', 'BW': 'Botswana', 'BR': 'Brazil',
	'BN': 'Brunei Darussalam', 'BG': 'Bulgaria', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'KH': 'Cambodia',
	'CM': 'Cameroon', 'CA': 'Canada', 'CV': 'Cape Verde', 'KY': 'Cayman Islands', 'CF': 'Central African Republic',
	'TD': 'Chad', 'CL': 'Chile', 'CN': 'China', 'CO': 'Colombia', 'KM': 'Comoros',
	'CG': 'Congo', 'CD': 'Congo, Democratic Republic of the', 'CK': 'Cook Islands', 'CR': 'Costa Rica', 'HR': 'Croatia',
	'CU': 'Cuba', 'CY': 'Cyprus', 'CZ': 'Czech Republic', 'DK': 'Denmark', 'DJ': 'Djibouti',
	'DM': 'Dominica', 'DO': 'Dominican Republic', 'EC': 'Ecuador', 'EG': 'Egypt', 'SV': 'El Salvador',
	'GQ': 'Equatorial Guinea', 'ER': 'Eritrea', 'EE': 'Estonia', 'ET': 'Ethiopia', 'FJ': 'Fiji',
	'FI': 'Finland', 'FR': 'France', 'GA': 'Gabon', 'GM': 'Gambia', 'GE': 'Georgia',
	'DE': 'Germany', 'GH': 'Ghana', 'GR': 'Greece', 'GL': 'Greenland', 'GD': 'Grenada',
	'GU': 'Guam', 'GT': 'Guatemala', 'GN': 'Guinea', 'GW': 'Guinea-Bissau', 'GY': 'Guyana',
	'HT': 'Haiti', 'VA': 'Holy See (Vatican City State)', 'HN': 'Honduras', 'HK': 'Hong Kong', 'HU': 'Hungary',
	'IS': 'Iceland', 'IN': 'India', 'ID': 'Indonesia', 'IR': 'Iran', 'IQ': 'Iraq',
	'IE': 'Ireland', 'IL': 'Israel', 'IT': 'Italy', 'CI': "Cote d'Ivoire", 'JM': 'Jamaica',
	'JP': 'Japan', 'JO': 'Jordan', 'KZ': 'Kazakhstan', 'KE': 'Kenya', 'KI': 'Kiribati',
	'KP': "Korea, Democratic People's Republic of", 'KR': 'Korea, Republic of', 'KW': 'Kuwait', 'KG': 'Kyrgyzstan', 'LA': "Lao People's Democratic Republic",
	'LV': 'Latvia', 'LB': 'Lebanon', 'LS': 'Lesotho', 'LR': 'Liberia', 'LY': 'Libyan Arab Jamahiriya',
	'LI': 'Liechtenstein', 'LT': 'Lithuania', 'LU': 'Luxembourg', 'MO': 'Macao', 'MK': 'Macedonia, The Former Yugoslav Republic of',
	'MG': 'Madagascar', 'MW': 'Malawi', 'MY': 'Malaysia', 'MV': 'Maldives', 'ML': 'Mali',
	'MT': 'Malta', 'MH': 'Marshall Islands', 'MR': 'Mauritania', 'MU': 'Mauritius', 'YT': 'Mayotte',
	'MX': 'Mexico', 'FM': 'Micronesia', 'MD': 'Moldova, Republic of', 'MC': 'Monaco', 'MN': 'Mongolia',
	'ME': 'Montenegro', 'MS': 'Montserrat', 'MA': 'Morocco', 'MZ': 'Mozambique', 'MM': 'Myanmar',
	'NA': 'Namibia', 'NR': 'Nauru', 'NP': 'Nepal', 'NL': 'Netherlands', 'AN': 'Netherlands Antilles',
	'NC': 'New Caledonia', 'NZ': 'New Zealand', 'NI': 'Nicaragua', 'NE': 'Niger', 'NG': 'Nigeria',
	'NU': 'Niue', 'NF': 'Norfolk Island', 'MP': 'Northern Mariana Islands', 'NO': 'Norway', 'OM': 'Oman',
	'PK': 'Pakistan', 'PW': 'Palau', 'PS': 'Palestinian Territory, Occupied', 'PA': 'Panama', 'PG': 'Papua New Guinea',
	'PY': 'Paraguay', 'PE': 'Peru', 'PH': 'Philippines', 'PN': 'Pitcairn', 'PL': 'Poland',
	'PT': 'Portugal', 'PR': 'Puerto Rico', 'QA': 'Qatar', 'RO': 'Romania', 'RU': 'Russian Federation',
	'RW': 'Rwanda', 'RE': 'Reunion', 'BL': 'Saint Barthelemy', 'SH': 'Saint Helena', 'KN': 'Saint Kitts and Nevis',
	'LC': 'Saint Lucia', 'MF': 'Saint Martin', 'PM': 'Saint Pierre and Miquelon', 'VC': 'Saint Vincent and the Grenadines', 'WS': 'Samoa',
	'SM': 'San Marino', 'ST': 'Sao Tome and Principe', 'SA': 'Saudi Arabia', 'SN': 'Senegal', 'RS': 'Serbia',
	'SC': 'Seychelles', 'SL': 'Sierra Leone', 'SG': 'Singapore', 'SK': 'Slovakia', 'SI': 'Slovenia',
	'SB': 'Solomon Islands', 'SO': 'Somalia', 'ZA': 'South Africa', 'GS': 'South Georgia and the South Sandwich Islands', 'ES': 'Spain',
	'LK': 'Sri Lanka', 'SD': 'Sudan', 'SR': 'Suriname', 'SJ': 'Svalbard and Jan Mayen', 'SZ': 'Swaziland',
	'SE': 'Sweden', 'CH': 'Switzerland', 'SY': 'Syrian Arab Republic', 'TW': 'Taiwan', 'TJ': 'Tajikistan',
	'TZ': 'Tanzania, United Republic of', 'TH': 'Thailand', 'TL': 'Timor-Leste', 'TG': 'Togo', 'TK': 'Tokelau',
	'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia', 'TR': 'Turkey', 'TM': 'Turkmenistan',
	'TV': 'Tuvalu', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates', 'GB': 'United Kingdom',
	'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu', 'VE': 'Venezuela',
	'VN': 'Viet Nam', 'VG': 'Virgin Islands, British', 'VI': 'Virgin Islands, U.S.', 'WF': 'Wallis and Futuna', 'EH': 'Western Sahara',
	'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
	}

	# Replace country codes with full country names
	df['region'] = df['region'].map(country_mapping)


	# Group by country and count the number of films
	df_grouped = df.groupby('region').size().reset_index(name='total_films')

	# Apply log transformation to handle outliers
	df_grouped['log_total_films'] = np.log1p(df_grouped['total_films'])

	# Create a choropleth map with the log-transformed data
	fig = px.choropleth(
	locations=df_grouped['region'],
	locationmode='country names',
	color=df_grouped['log_total_films'],
	hover_name=df_grouped['region'],
	color_continuous_scale='solar'
	)

	# Update layout for title and other attributes
	fig.update_layout(
	title_font=dict(size=28, family='Arial, sans-serif'),
	title_x=0.5,
	title_y=0.95,
	title_text='Total Number of Movies by Country',
	geo=dict(
	showframe=False,
	showcoastlines=True,
	projection_type="natural earth"
	)
	)

	return fig

	# Function to fetch summary info
	def fetch_summary_info(conn):
	# Fetch total count of movies
	query_total_movies = r'''
	SELECT COUNT(*) as total_movies
	FROM title_basics
	WHERE titleType = 'movie'
	'''
	total_movies = pd.read_sql_query(query_total_movies, conn).iloc[0]['total_movies']

	# Fetch total count of years
	query_total_years = r'''
	SELECT COUNT(DISTINCT startYear) as total_years
	FROM title_basics
	WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\N'
	'''
	total_years = pd.read_sql_query(query_total_years, conn).iloc[0]['total_years']

	# Fetch average rating of movies
	query_avg_rating = r'''
	SELECT AVG(averageRating) as avg_rating
	FROM title_ratings
	'''
	avg_rating = pd.read_sql_query(query_avg_rating, conn).iloc[0]['avg_rating']

	return total_movies, total_years, avg_rating

	# Main Streamlit app
	def run_app():
	st.title('IMDb Movie Data Analysis')

	# Load data from SQLite database
	conn = load_data('imdb_data.db')
	genre_counts = fetch_genre_movie_releases(conn)
	total_movies, total_years, avg_rating = fetch_summary_info(conn)

	# Layout for summary info in three columns
	col1, col2, col3 = st.columns(3)

	with col1:
	st.subheader('Total Movies')
	st.metric(label='zzz', value=total_movies)

	with col2:
	st.subheader('Total Years of Movie Data')
	st.metric(label='zzz', value=total_years)

	with col3:
	st.subheader('Average Movie Rating')
	st.metric(label='zzz', value=f'{avg_rating:.2f}')

	# Find and display best movies by genre
	best_movies_by_genre = find_best_movies_by_genre(conn)
	fig_global_map = plot_global_map()
	fig_genre_movie_releases = plot_stacked_genre_movie_releases(genre_counts)

	# Layout for best movies by genre in two columns
	col1, col2 = st.columns(2)

	with col1:
	st.subheader('Genre Movie Releases by Year')
	st.plotly_chart(fig_genre_movie_releases)

	with col2:
	st.subheader('Global Map of Films')
	st.plotly_chart(fig_global_map, use_container_width=True)

	fig_avg_rating_by_genre = fetch_and_plot_average_rating_by_genre(conn)

	# Layout for Plotly charts in three columns
	col1, col2, col3 = st.columns(3)

	with col1:
	st.subheader('Best Movies by Genre')
	st.dataframe(best_movies_by_genre)

	with col2:
	create_genre_wordcloud(conn, 'mask.png')

	with col3:
	st.subheader('Average Rating by Genre')
	st.plotly_chart(fig_avg_rating_by_genre)

	# Close database connection
	conn.close()

	if __name__ == '__main__':
	run_app()