Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,51 +1,84 @@
|
|
1 |
import sqlite3
|
2 |
import pandas as pd
|
3 |
-
import
|
4 |
from wordcloud import WordCloud
|
5 |
import matplotlib.pyplot as plt
|
6 |
-
|
|
|
|
|
7 |
|
8 |
# Function to load data from SQLite database
|
|
|
9 |
def load_data(db_file):
|
10 |
conn = sqlite3.connect(db_file)
|
11 |
return conn
|
12 |
|
13 |
# Function to fetch data from database based on query
|
|
|
14 |
def fetch_data(conn, query):
|
15 |
return pd.read_sql_query(query, conn)
|
16 |
|
17 |
-
# Function to
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
21 |
FROM title_basics
|
22 |
-
WHERE titleType = 'movie'
|
23 |
-
GROUP BY startYear
|
24 |
-
ORDER BY startYear
|
25 |
'''
|
26 |
-
|
27 |
-
return df
|
28 |
|
29 |
-
#
|
30 |
-
|
31 |
-
query = '''
|
32 |
SELECT COUNT(DISTINCT startYear) as total_years
|
33 |
FROM title_basics
|
34 |
WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\\N'
|
35 |
'''
|
36 |
-
|
37 |
-
return df
|
38 |
|
39 |
-
#
|
40 |
-
|
41 |
-
query = '''
|
42 |
SELECT AVG(averageRating) as avg_rating
|
43 |
FROM title_ratings
|
44 |
'''
|
45 |
-
|
46 |
-
return df
|
47 |
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
def create_genre_wordcloud(conn):
|
50 |
query = '''
|
51 |
SELECT genres
|
@@ -53,85 +86,77 @@ def create_genre_wordcloud(conn):
|
|
53 |
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
|
54 |
'''
|
55 |
df = fetch_data(conn, query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
-
#
|
58 |
-
|
59 |
-
|
60 |
-
# Generate word cloud
|
61 |
-
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_genres)
|
62 |
-
|
63 |
-
# Plot word cloud
|
64 |
-
plt.figure(figsize=(10, 5))
|
65 |
-
plt.imshow(wordcloud, interpolation='bilinear')
|
66 |
-
plt.axis('off')
|
67 |
-
plt.tight_layout()
|
68 |
-
|
69 |
-
# Save word cloud to a temporary file
|
70 |
-
temp_file = '/tmp/wordcloud.png'
|
71 |
-
plt.savefig(temp_file)
|
72 |
-
|
73 |
-
return temp_file
|
74 |
-
|
75 |
-
# Function to plot globe map using Plotly
|
76 |
-
def plot_globe_map(df):
|
77 |
-
fig = px.scatter_geo(df, lat='latitude', lon='longitude', hover_name='primaryTitle',
|
78 |
-
color='country', size='runtimeMinutes', size_max=20,
|
79 |
-
projection='natural earth')
|
80 |
-
fig.update_geos(showland=True, landcolor='rgb(217, 217, 217)',
|
81 |
-
countrycolor='rgb(0, 0, 0)')
|
82 |
-
fig.update_layout(title='Movie Locations on Globe',
|
83 |
-
margin={"r":0,"t":0,"l":0,"b":0})
|
84 |
-
return fig
|
85 |
|
86 |
-
#
|
87 |
-
def main():
|
88 |
-
st.title('Movie Locations Dashboard')
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
# Load data from SQLite database
|
98 |
db_file = 'imdb_data.db' # Adjust path as needed
|
99 |
conn = load_data(db_file)
|
100 |
|
101 |
-
#
|
102 |
-
|
103 |
-
|
|
|
|
|
104 |
|
105 |
-
#
|
106 |
-
|
107 |
-
|
|
|
108 |
|
109 |
-
#
|
110 |
-
|
111 |
-
avg_rating = df_avg_rating.iloc[0]['avg_rating']
|
112 |
|
113 |
-
#
|
114 |
-
|
|
|
|
|
115 |
|
116 |
# Close database connection
|
117 |
conn.close()
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
# Column 1: Total Count of Years
|
123 |
-
with col1:
|
124 |
-
st.subheader('Total Count of Years')
|
125 |
-
st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{total_years}</p>", unsafe_allow_html=True)
|
126 |
-
|
127 |
-
# Column 2: Total Movie Releases
|
128 |
-
with col2:
|
129 |
-
st.subheader('Total Movie Releases')
|
130 |
-
st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{total_movies}</p>", unsafe_allow_html=True)
|
131 |
-
|
132 |
-
# Column 3: Average Rating
|
133 |
-
with col3:
|
134 |
-
st.subheader('Average Rating')
|
135 |
-
st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)
|
136 |
-
|
137 |
-
# Display word cloud of ge
|
|
|
1 |
import sqlite3
|
2 |
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
from wordcloud import WordCloud
|
5 |
import matplotlib.pyplot as plt
|
6 |
+
from collections import Counter
|
7 |
+
import numpy as np
|
8 |
+
import streamlit as st
|
9 |
|
10 |
# Function to load data from SQLite database
@st.cache(allow_output_mutation=True)
def load_data(db_file):
    """Open the SQLite database at ``db_file`` and return the connection.

    The function is wrapped in ``st.cache``, so repeated calls with the same
    path are served from Streamlit's cache instead of reconnecting.

    Args:
        db_file: Path of the SQLite database file to open.

    Returns:
        An open ``sqlite3.Connection``.
    """
    # By default sqlite3 refuses to use a connection from any thread other
    # than the one that created it. Because the connection is cached and may
    # be handed to a later script run, disable that check here.
    # NOTE(review): this assumes reruns can execute on a different thread -
    # confirm against the deployed Streamlit version.
    return sqlite3.connect(db_file, check_same_thread=False)
|
15 |
|
16 |
# Function to fetch data from database based on query
# sqlite3 connections cannot be pickled, so the cache is told to key them by
# object identity instead of trying to hash their contents.
@st.cache(allow_output_mutation=True, hash_funcs={sqlite3.Connection: id})
def fetch_data(conn, query):
    """Run ``query`` against ``conn`` and return the rows as a DataFrame.

    Args:
        conn: Open database connection passed to ``pd.read_sql_query``.
        query: SQL text to execute.

    Returns:
        A ``pandas.DataFrame`` holding the query result.
    """
    return pd.read_sql_query(query, conn)
|
20 |
|
21 |
+
# Function to fetch summary info from database
# sqlite3 connections cannot be pickled, so the cache keys them by identity.
@st.cache(allow_output_mutation=True, hash_funcs={sqlite3.Connection: id})
def fetch_summary_info(conn):
    """Collect the three headline figures shown at the top of the dashboard.

    Args:
        conn: Open database connection forwarded to ``fetch_data``.

    Returns:
        A ``(total_movies, total_years, avg_rating)`` tuple, each value taken
        from the first row of its single-row aggregate query.
    """
    # Fetch total count of movies
    query_total_movies = '''
        SELECT COUNT(*) as total_movies
        FROM title_basics
        WHERE titleType = 'movie'
    '''
    total_movies = fetch_data(conn, query_total_movies).iloc[0]['total_movies']

    # Fetch total count of years ('\N' rows are excluded by the query)
    query_total_years = '''
        SELECT COUNT(DISTINCT startYear) as total_years
        FROM title_basics
        WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\\N'
    '''
    total_years = fetch_data(conn, query_total_years).iloc[0]['total_years']

    # Fetch average rating.
    # NOTE(review): this averages every row of title_ratings; it is not
    # restricted to titleType = 'movie' like the two queries above - confirm
    # whether a join on title_basics is intended.
    query_avg_rating = '''
        SELECT AVG(averageRating) as avg_rating
        FROM title_ratings
    '''
    avg_rating = fetch_data(conn, query_avg_rating).iloc[0]['avg_rating']

    return total_movies, total_years, avg_rating
|
48 |
+
|
49 |
+
# Function to plot global map of total films per region
@st.cache(allow_output_mutation=True)
def plot_global_map(df):
    """Build a choropleth of film counts per country on a log colour scale.

    Args:
        df: DataFrame with a ``region`` column of country codes, one row per
            film. The frame is not modified.

    Returns:
        The Plotly figure produced by ``px.choropleth``.
    """
    # Country code to name mapping (only a few examples shown for brevity)
    country_mapping = {
        'AF': 'Afghanistan', 'AX': 'Åland Islands', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa',
        # Add more mappings as per your original list
    }

    # Map country codes to country names on a copy: assigning into ``df``
    # would overwrite the caller's column, and a second call on the same
    # frame would then map the already-translated names to NaN.
    # Codes missing from the mapping become NaN and are dropped by groupby.
    mapped = df.assign(region=df['region'].map(country_mapping))

    # Group by country and count the number of films
    df_grouped = mapped.groupby('region').size().reset_index(name='total_films')

    # Apply log transformation to handle outliers
    df_grouped['log_total_films'] = np.log1p(df_grouped['total_films'])

    # Create a choropleth map with the log-transformed data
    fig = px.choropleth(
        df_grouped,
        locations='region',
        locationmode='country names',
        color='log_total_films',
        hover_name='region',
        color_continuous_scale='Plasma',  # Change the color scheme here
        labels={'log_total_films': 'Total Films (log scale)'},
    )

    # Update layout of the map
    fig.update_layout(
        title='Total Films by Country (Log Scale)',
        geo=dict(showframe=False, showcoastlines=False,
                 projection_type='equirectangular'),
    )

    return fig
|
79 |
+
|
80 |
+
# Function to create word cloud of genres
|
81 |
+
@st.cache(allow_output_mutation=True)
|
82 |
def create_genre_wordcloud(conn):
|
83 |
query = '''
|
84 |
SELECT genres
|
|
|
86 |
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
|
87 |
'''
|
88 |
df = fetch_data(conn, query)
|
89 |
+
|
90 |
+
# Process genres
|
91 |
+
genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
|
92 |
+
genre_counts = Counter(genres)
|
93 |
+
|
94 |
+
# Generate the word cloud
|
95 |
+
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
|
96 |
+
|
97 |
+
# Display the word cloud using Streamlit
|
98 |
+
st.image(wordcloud.to_array(), use_column_width=True)
|
99 |
|
100 |
+
# Save word cloud to a temporary file (optional)
|
101 |
+
# temp_file = '/tmp/wordcloud.png'
|
102 |
+
# plt.savefig(temp_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
+
# return temp_file
|
|
|
|
|
105 |
|
106 |
+
# Function to find best movie of each genre by numVotes * averageRating
# sqlite3 connections cannot be pickled, so the cache keys them by identity.
@st.cache(allow_output_mutation=True, hash_funcs={sqlite3.Connection: id})
def find_best_movies_by_genre(conn):
    """Return the top-scoring movie for each genre.

    A movie's score is ``numVotes * averageRating`` and its genre is the
    first entry of its comma-separated ``genres`` field.

    Args:
        conn: Open database connection forwarded to ``fetch_data``.

    Returns:
        A DataFrame with one row per genre (columns ``genre``,
        ``primaryTitle``, ``startYear``, ``averageRating``, ``numVotes``,
        ``score``), sorted by ``score`` descending.
    """
    query = '''
        SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
        FROM title_basics tb
        JOIN title_ratings tr ON tb.tconst = tr.tconst
        WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
    '''
    df = fetch_data(conn, query)

    # Split genres and select the first genre for each movie. The ``.str[0]``
    # accessor is used instead of ``expand=True)[0]`` because the expanded
    # frame has no column 0 when the query returns no rows.
    df['genre'] = df['genres'].str.split(',').str[0]

    # Calculate score based on numVotes * averageRating
    df['score'] = df['numVotes'] * df['averageRating']

    # Get the best movie (highest score) for each genre
    idx = df.groupby('genre')['score'].idxmax()
    columns = ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']
    best_movies_by_genre = (
        df.loc[idx, columns]
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    return best_movies_by_genre
|
129 |
+
|
130 |
+
# Main function to orchestrate the dashboard
def main():
    """Render the IMDb dashboard: summary figures, map, word cloud, best movies."""
    st.title('IMDb Dashboard')

    # Load data from SQLite database
    db_file = 'imdb_data.db'  # Adjust path as needed
    conn = load_data(db_file)

    # Fetch and display summary info
    total_movies, total_years, avg_rating = fetch_summary_info(conn)
    st.write(f"Total Movies: {total_movies}")
    st.write(f"Total Years: {total_years}")
    st.write(f"Average Rating: {avg_rating:.2f}")

    # Display global map of total films per region
    df_movie_region = pd.read_csv('movie_region.csv')  # Replace with your actual CSV loading
    fig = plot_global_map(df_movie_region)
    st.plotly_chart(fig)

    # Display word cloud of genres
    create_genre_wordcloud(conn)

    # Find and display the best movie of each genre
    best_movies_by_genre = find_best_movies_by_genre(conn)
    st.subheader("Best Movie of Each Genre:")
    st.write(best_movies_by_genre)

    # The connection is deliberately left open: load_data() is cached, so
    # closing it here would hand an already-closed connection to the next
    # script run that hits the cache.
|
159 |
|
160 |
+
# Script entry point: build the dashboard when this file is run directly
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|