Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,51 +1,84 @@
|
|
| 1 |
import sqlite3
|
| 2 |
import pandas as pd
|
| 3 |
-
import
|
| 4 |
from wordcloud import WordCloud
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Function to load data from SQLite database
|
|
|
|
| 9 |
def load_data(db_file):
|
| 10 |
conn = sqlite3.connect(db_file)
|
| 11 |
return conn
|
| 12 |
|
| 13 |
# Function to fetch data from database based on query
|
|
|
|
| 14 |
def fetch_data(conn, query):
|
| 15 |
return pd.read_sql_query(query, conn)
|
| 16 |
|
| 17 |
-
# Function to
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
FROM title_basics
|
| 22 |
-
WHERE titleType = 'movie'
|
| 23 |
-
GROUP BY startYear
|
| 24 |
-
ORDER BY startYear
|
| 25 |
'''
|
| 26 |
-
|
| 27 |
-
return df
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
query = '''
|
| 32 |
SELECT COUNT(DISTINCT startYear) as total_years
|
| 33 |
FROM title_basics
|
| 34 |
WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\\N'
|
| 35 |
'''
|
| 36 |
-
|
| 37 |
-
return df
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
|
| 41 |
-
query = '''
|
| 42 |
SELECT AVG(averageRating) as avg_rating
|
| 43 |
FROM title_ratings
|
| 44 |
'''
|
| 45 |
-
|
| 46 |
-
return df
|
| 47 |
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def create_genre_wordcloud(conn):
|
| 50 |
query = '''
|
| 51 |
SELECT genres
|
|
@@ -53,85 +86,77 @@ def create_genre_wordcloud(conn):
|
|
| 53 |
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
|
| 54 |
'''
|
| 55 |
df = fetch_data(conn, query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
# Generate word cloud
|
| 61 |
-
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_genres)
|
| 62 |
-
|
| 63 |
-
# Plot word cloud
|
| 64 |
-
plt.figure(figsize=(10, 5))
|
| 65 |
-
plt.imshow(wordcloud, interpolation='bilinear')
|
| 66 |
-
plt.axis('off')
|
| 67 |
-
plt.tight_layout()
|
| 68 |
-
|
| 69 |
-
# Save word cloud to a temporary file
|
| 70 |
-
temp_file = '/tmp/wordcloud.png'
|
| 71 |
-
plt.savefig(temp_file)
|
| 72 |
-
|
| 73 |
-
return temp_file
|
| 74 |
-
|
| 75 |
-
# Function to plot globe map using Plotly
|
| 76 |
-
def plot_globe_map(df):
|
| 77 |
-
fig = px.scatter_geo(df, lat='latitude', lon='longitude', hover_name='primaryTitle',
|
| 78 |
-
color='country', size='runtimeMinutes', size_max=20,
|
| 79 |
-
projection='natural earth')
|
| 80 |
-
fig.update_geos(showland=True, landcolor='rgb(217, 217, 217)',
|
| 81 |
-
countrycolor='rgb(0, 0, 0)')
|
| 82 |
-
fig.update_layout(title='Movie Locations on Globe',
|
| 83 |
-
margin={"r":0,"t":0,"l":0,"b":0})
|
| 84 |
-
return fig
|
| 85 |
|
| 86 |
-
#
|
| 87 |
-
def main():
|
| 88 |
-
st.title('Movie Locations Dashboard')
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# Load data from SQLite database
|
| 98 |
db_file = 'imdb_data.db' # Adjust path as needed
|
| 99 |
conn = load_data(db_file)
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
#
|
| 106 |
-
|
| 107 |
-
|
|
|
|
| 108 |
|
| 109 |
-
#
|
| 110 |
-
|
| 111 |
-
avg_rating = df_avg_rating.iloc[0]['avg_rating']
|
| 112 |
|
| 113 |
-
#
|
| 114 |
-
|
|
|
|
|
|
|
| 115 |
|
| 116 |
# Close database connection
|
| 117 |
conn.close()
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
# Column 1: Total Count of Years
|
| 123 |
-
with col1:
|
| 124 |
-
st.subheader('Total Count of Years')
|
| 125 |
-
st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{total_years}</p>", unsafe_allow_html=True)
|
| 126 |
-
|
| 127 |
-
# Column 2: Total Movie Releases
|
| 128 |
-
with col2:
|
| 129 |
-
st.subheader('Total Movie Releases')
|
| 130 |
-
st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{total_movies}</p>", unsafe_allow_html=True)
|
| 131 |
-
|
| 132 |
-
# Column 3: Average Rating
|
| 133 |
-
with col3:
|
| 134 |
-
st.subheader('Average Rating')
|
| 135 |
-
st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)
|
| 136 |
-
|
| 137 |
-
# Display word cloud of ge
|
|
|
|
| 1 |
import sqlite3
|
| 2 |
import pandas as pd
|
| 3 |
+
import plotly.express as px
|
| 4 |
from wordcloud import WordCloud
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
+
from collections import Counter
|
| 7 |
+
import numpy as np
|
| 8 |
+
import streamlit as st
|
| 9 |
|
| 10 |
# Function to load data from SQLite database
|
| 11 |
+
@st.cache(allow_output_mutation=True)
def load_data(db_file):
    """Open a SQLite connection to ``db_file`` and cache it.

    Args:
        db_file: Path of the SQLite database file to open.

    Returns:
        An open ``sqlite3.Connection``. Because of the cache decorator the
        same connection object is returned for repeated calls with the same
        path.
    """
    # The cached connection can be handed to a script run that executes on
    # a different thread than the one that created it. sqlite3's default
    # (check_same_thread=True) raises ProgrammingError in that situation,
    # so the check is disabled for this shared connection.
    conn = sqlite3.connect(db_file, check_same_thread=False)
    return conn
|
| 15 |
|
| 16 |
# Function to fetch data from database based on query
|
| 17 |
+
# sqlite3.Connection objects cannot be hashed by the cache, so the
# connection is keyed by identity; the SQL text is hashed normally.
@st.cache(allow_output_mutation=True, hash_funcs={sqlite3.Connection: id})
def fetch_data(conn, query):
    """Run ``query`` on ``conn`` and return the result set as a DataFrame.

    Args:
        conn: Open SQLite connection to read from.
        query: SQL text to execute.

    Returns:
        A ``pandas.DataFrame`` holding the rows produced by ``query``. The
        result is cached, so callers that need to modify it should work on
        a copy.
    """
    return pd.read_sql_query(query, conn)
|
| 20 |
|
| 21 |
+
# Function to fetch summary info from database
|
| 22 |
+
# sqlite3.Connection objects cannot be hashed by the cache, so the
# connection is keyed by identity.
@st.cache(allow_output_mutation=True, hash_funcs={sqlite3.Connection: id})
def fetch_summary_info(conn):
    """Fetch the three headline numbers shown at the top of the dashboard.

    Args:
        conn: Open SQLite connection to the IMDb database.

    Returns:
        A ``(total_movies, total_years, avg_rating)`` tuple:

        * ``total_movies`` - number of ``title_basics`` rows whose
          ``titleType`` is ``'movie'``.
        * ``total_years`` - number of distinct ``startYear`` values among
          those movies, ignoring NULL and the ``\\N`` placeholder.
        * ``avg_rating`` - mean of ``averageRating`` over ``title_ratings``.
          This query has no ``titleType`` filter, so it covers every rated
          title in that table.
    """

    def single_value(query, column):
        # Each summary query yields exactly one row with one named column.
        return fetch_data(conn, query).iloc[0][column]

    # Fetch total count of movies
    query_total_movies = '''
        SELECT COUNT(*) as total_movies
        FROM title_basics
        WHERE titleType = 'movie'
    '''
    total_movies = single_value(query_total_movies, 'total_movies')

    # Fetch total count of years
    query_total_years = '''
        SELECT COUNT(DISTINCT startYear) as total_years
        FROM title_basics
        WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\\N'
    '''
    total_years = single_value(query_total_years, 'total_years')

    # Fetch average rating
    query_avg_rating = '''
        SELECT AVG(averageRating) as avg_rating
        FROM title_ratings
    '''
    avg_rating = single_value(query_avg_rating, 'avg_rating')

    return total_movies, total_years, avg_rating
|
| 48 |
+
|
| 49 |
+
# Function to plot global map of total films per region
|
| 50 |
+
@st.cache(allow_output_mutation=True)
def plot_global_map(df):
    """Build a choropleth of film counts per country on a log colour scale.

    Args:
        df: DataFrame with a ``region`` column of country codes, one row
            per film/region pair. The frame is not modified.

    Returns:
        A Plotly figure colouring each country by ``log1p`` of its film
        count.
    """
    # Country code to name mapping (only a few examples shown for brevity)
    country_mapping = {
        'AF': 'Afghanistan', 'AX': 'Åland Islands', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa',
        # Add more mappings as per your original list
    }

    # Map country codes to country names on a copy, so the caller's
    # DataFrame keeps its original codes (the previous in-place assignment
    # overwrote them). Codes missing from the mapping become NaN and are
    # therefore left out of the groupby below.
    named = df.assign(region=df['region'].map(country_mapping))

    # Group by country and count the number of films
    df_grouped = named.groupby('region').size().reset_index(name='total_films')

    # Apply log transformation to handle outliers
    df_grouped['log_total_films'] = np.log1p(df_grouped['total_films'])

    # Create a choropleth map with the log-transformed data
    fig = px.choropleth(
        df_grouped,
        locations='region',
        locationmode='country names',
        color='log_total_films',
        hover_name='region',
        color_continuous_scale='Plasma',  # Change the color scheme here
        labels={'log_total_films': 'Total Films (log scale)'},
    )

    # Update layout of the map
    fig.update_layout(
        title='Total Films by Country (Log Scale)',
        geo=dict(showframe=False, showcoastlines=False,
                 projection_type='equirectangular'),
    )

    return fig
|
| 79 |
+
|
| 80 |
+
# Function to create word cloud of genres
|
| 81 |
+
@st.cache(allow_output_mutation=True)
|
| 82 |
def create_genre_wordcloud(conn):
|
| 83 |
query = '''
|
| 84 |
SELECT genres
|
|
|
|
| 86 |
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
|
| 87 |
'''
|
| 88 |
df = fetch_data(conn, query)
|
| 89 |
+
|
| 90 |
+
# Process genres
|
| 91 |
+
genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
|
| 92 |
+
genre_counts = Counter(genres)
|
| 93 |
+
|
| 94 |
+
# Generate the word cloud
|
| 95 |
+
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
|
| 96 |
+
|
| 97 |
+
# Display the word cloud using Streamlit
|
| 98 |
+
st.image(wordcloud.to_array(), use_column_width=True)
|
| 99 |
|
| 100 |
+
# Save word cloud to a temporary file (optional)
|
| 101 |
+
# temp_file = '/tmp/wordcloud.png'
|
| 102 |
+
# plt.savefig(temp_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
+
# return temp_file
|
|
|
|
|
|
|
| 105 |
|
| 106 |
+
# Function to find best movie of each genre by numVotes * averageRating
|
| 107 |
+
# sqlite3.Connection objects cannot be hashed by the cache, so the
# connection is keyed by identity.
@st.cache(allow_output_mutation=True, hash_funcs={sqlite3.Connection: id})
def find_best_movies_by_genre(conn):
    """Pick the top movie of each genre, ranked by numVotes * averageRating.

    Args:
        conn: Open SQLite connection to the IMDb database.

    Returns:
        A DataFrame with one row per genre and the columns ``genre``,
        ``primaryTitle``, ``startYear``, ``averageRating``, ``numVotes`` and
        ``score``, sorted by ``score`` in descending order. A movie's genre
        is the first entry of its comma-separated ``genres`` value.
    """
    query = '''
        SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
        FROM title_basics tb
        JOIN title_ratings tr ON tb.tconst = tr.tconst
        WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
    '''
    df = fetch_data(conn, query)

    # fetch_data() returns its cached DataFrame object, so the derived
    # columns are added with assign() (which copies) instead of in place.
    # Only the first genre is needed, so split once rather than expanding
    # every genre into its own column.
    df = df.assign(
        genre=df['genres'].str.split(',', n=1).str[0],
        score=df['numVotes'] * df['averageRating'],
    )

    # Get the best movie (highest score) for each genre
    idx = df.groupby('genre')['score'].idxmax()
    columns = ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']
    best_movies_by_genre = (
        df.loc[idx, columns]
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    return best_movies_by_genre
|
| 129 |
+
|
| 130 |
+
# Main function to orchestrate the dashboard
|
| 131 |
+
def main():
    """Render the IMDb dashboard: summary numbers, map, word cloud, table."""
    st.title('IMDb Dashboard')

    # Load data from SQLite database
    db_file = 'imdb_data.db'  # Adjust path as needed
    conn = load_data(db_file)

    # Fetch and display summary info
    total_movies, total_years, avg_rating = fetch_summary_info(conn)
    st.write(f"Total Movies: {total_movies}")
    st.write(f"Total Years: {total_years}")
    st.write(f"Average Rating: {avg_rating:.2f}")

    # Display global map of total films per region
    df_movie_region = pd.read_csv('movie_region.csv')  # Replace with your actual CSV loading
    fig = plot_global_map(df_movie_region)
    st.plotly_chart(fig)

    # Display word cloud of genres
    create_genre_wordcloud(conn)

    # Find and display the best movie of each genre
    best_movies_by_genre = find_best_movies_by_genre(conn)
    st.subheader("Best Movie of Each Genre:")
    st.write(best_movies_by_genre)

    # NOTE: the connection is deliberately left open. load_data() is
    # cached, so every later rerun receives this same connection object;
    # closing it here would make the next rerun fail on a closed database.
|
| 159 |
|
| 160 |
+
# Execute the main function
|
| 161 |
+
# Build the dashboard only when this file is run as the entry script.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|