Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -15,32 +15,32 @@ def load_data(db_file):
|
|
15 |
# Function to fetch genre movie releases by year
|
16 |
def fetch_genre_movie_releases(conn):
|
17 |
query = '''
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
'''
|
22 |
df = pd.read_sql_query(query, conn)
|
23 |
-
|
24 |
# Split genres and explode to separate rows
|
25 |
df['genres'] = df['genres'].str.split(',')
|
26 |
df = df.explode('genres')
|
27 |
-
|
28 |
# Convert startYear to numeric
|
29 |
df['startYear'] = pd.to_numeric(df['startYear'])
|
30 |
-
|
31 |
# Group by startYear and genre, count the number of movies
|
32 |
genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
|
33 |
-
|
34 |
return genre_counts
|
35 |
|
36 |
# Function to fetch data for filled line chart of movie release years
|
37 |
def fetch_movie_release_years(conn):
|
38 |
query_release_years = '''
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
'''
|
45 |
df_release_years = pd.read_sql_query(query_release_years, conn)
|
46 |
return df_release_years
|
@@ -48,39 +48,39 @@ def fetch_movie_release_years(conn):
|
|
48 |
# Function to fetch data and create box plot of average rating by first_genre
|
49 |
def fetch_and_plot_average_rating_by_genre(conn):
|
50 |
query = '''
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
'''
|
56 |
df = pd.read_sql_query(query, conn)
|
57 |
-
|
58 |
# Function to extract the first genre from the genres list
|
59 |
def extract_first_genre(genres):
|
60 |
if genres:
|
61 |
return genres.split(',')[0].strip()
|
62 |
else:
|
63 |
return None
|
64 |
-
|
65 |
# Apply the function to extract the first genre
|
66 |
df['first_genre'] = df['genres'].apply(extract_first_genre)
|
67 |
-
|
68 |
# Drop rows where first_genre is None (shouldn't be necessary if genres column is clean)
|
69 |
df = df.dropna(subset=['first_genre'])
|
70 |
-
|
71 |
# Create a box plot of average rating by first_genre
|
72 |
fig = px.box(df, x='first_genre', y='averageRating',
|
73 |
labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'},
|
74 |
title='Average Rating of Movies by First Genre')
|
75 |
-
|
76 |
return fig
|
77 |
|
78 |
# Function to create word cloud of genres
|
79 |
def create_genre_wordcloud(conn):
|
80 |
query = '''
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
'''
|
85 |
df = pd.read_sql_query(query, conn)
|
86 |
|
@@ -101,10 +101,10 @@ def create_genre_wordcloud(conn):
|
|
101 |
# Function to find best movie of each genre by numVotes * averageRating
|
102 |
def find_best_movies_by_genre(conn):
|
103 |
query = '''
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
'''
|
109 |
df = pd.read_sql_query(query, conn)
|
110 |
|
@@ -126,7 +126,7 @@ def plot_genre_movie_releases(genre_counts):
|
|
126 |
fig = px.line(genre_counts, x='startYear', y='count', color='genres',
|
127 |
title='Genre Movie Releases by Year',
|
128 |
labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
|
129 |
-
|
130 |
fig.update_layout(xaxis_tickmode='linear') # Ensure x-axis ticks are shown in a linear manner
|
131 |
fig.update_xaxes(range=[2000, 2025])
|
132 |
return fig
|
@@ -186,6 +186,7 @@ def plot_global_map(df):
|
|
186 |
'ZM': 'Zambia', 'ZW': 'Zimbabwe'
|
187 |
}
|
188 |
|
|
|
189 |
# Map the codes to country names
|
190 |
df['country'] = df['country'].map(country_mapping)
|
191 |
|
@@ -194,7 +195,7 @@ def plot_global_map(df):
|
|
194 |
color='total_movies', hover_name='country',
|
195 |
title='Total Films Per Country',
|
196 |
color_continuous_scale=px.colors.sequential.Plasma)
|
197 |
-
|
198 |
fig.update_layout(coloraxis_colorbar=dict(title='Total Movies', lenmode='fraction', len=0.7))
|
199 |
|
200 |
return fig
|
@@ -202,70 +203,44 @@ def plot_global_map(df):
|
|
202 |
# Function to run the Streamlit application
|
203 |
def run_app():
|
204 |
st.title('IMDb Movie Dashboard')
|
205 |
-
|
206 |
# Connect to SQLite database
|
207 |
conn = load_data('imdb.db')
|
208 |
-
|
209 |
# Fetch data for different visualizations
|
210 |
genre_counts = fetch_genre_movie_releases(conn)
|
211 |
df_release_years = fetch_movie_release_years(conn)
|
212 |
best_movies = find_best_movies_by_genre(conn)
|
213 |
-
|
214 |
# Create figures for each visualization
|
215 |
fig_genre_releases = plot_genre_movie_releases(genre_counts)
|
216 |
fig_movie_years = plot_movie_release_years(df_release_years)
|
217 |
fig_average_rating = fetch_and_plot_average_rating_by_genre(conn)
|
218 |
-
|
219 |
-
#
|
220 |
st.header('Genre Movie Releases by Year')
|
221 |
st.plotly_chart(fig_genre_releases, use_container_width=True)
|
222 |
-
|
223 |
st.header('Movie Release Years')
|
224 |
st.plotly_chart(fig_movie_years, use_container_width=True)
|
225 |
-
|
226 |
st.header('Average Rating by Genre')
|
227 |
st.plotly_chart(fig_average_rating, use_container_width=True)
|
228 |
-
|
229 |
st.header('Genre Word Cloud')
|
230 |
create_genre_wordcloud(conn)
|
231 |
-
|
232 |
st.header('Best Movies by Genre')
|
233 |
st.dataframe(best_movies)
|
234 |
-
|
235 |
st.header('Global Map of Films')
|
236 |
-
df_global_map = pd.read_csv('movie_region.csv')
|
237 |
-
|
238 |
fig_global_map = plot_global_map(df_global_map)
|
239 |
-
|
240 |
-
# Display
|
241 |
-
st.header('Global Map of Films')
|
242 |
st.plotly_chart(fig_global_map, use_container_width=True)
|
243 |
-
|
244 |
-
# Define the layout using st.columns for a (3,2) grid
|
245 |
-
col1, col2, col3 = st.columns(3)
|
246 |
-
|
247 |
-
with col1:
|
248 |
-
st.header('Genre Movie Releases by Year')
|
249 |
-
st.plotly_chart(fig_genre_releases, use_container_width=True)
|
250 |
-
|
251 |
-
with col2:
|
252 |
-
st.header('Movie Release Years')
|
253 |
-
st.plotly_chart(fig_movie_years, use_container_width=True)
|
254 |
-
|
255 |
-
with col3:
|
256 |
-
st.header('Average Rating by Genre')
|
257 |
-
st.plotly_chart(fig_average_rating, use_container_width=True)
|
258 |
-
|
259 |
-
col4, col5 = st.columns(2)
|
260 |
-
|
261 |
-
with col4:
|
262 |
-
st.header('Genre Word Cloud')
|
263 |
-
create_genre_wordcloud(conn)
|
264 |
-
|
265 |
-
with col5:
|
266 |
-
st.header('Global Map of Films')
|
267 |
-
st.plotly_chart(fig_global_map, use_container_width=True)
|
268 |
-
|
269 |
# Close connection to database
|
270 |
conn.close()
|
271 |
|
|
|
15 |
# Function to fetch genre movie releases by year
|
16 |
def fetch_genre_movie_releases(conn):
    """Count movie releases per (year, genre) pair.

    Reads ``startYear`` and ``genres`` from ``title_basics`` for rows whose
    ``titleType`` is ``'movie'`` and whose year and genre list are not the
    backslash-N placeholder, splits the comma-separated genre list into one
    row per genre, and counts rows per year/genre combination.

    Args:
        conn: An open database connection accepted by
            ``pandas.read_sql_query``.

    Returns:
        A DataFrame with columns ``startYear`` (int), ``genres`` (str) and
        ``count`` (int), one row per year/genre combination present.
    """
    query = '''
        SELECT startYear, genres
        FROM title_basics
        WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
    '''
    df = pd.read_sql_query(query, conn)

    # Coerce rather than raise: a single malformed year should not take the
    # whole chart down. Rows whose year cannot be parsed are discarded.
    years = pd.to_numeric(df['startYear'], errors='coerce')
    df = df.assign(startYear=years).dropna(subset=['startYear', 'genres'])

    # Split genres and explode to separate rows (one row per genre).
    df = df.assign(
        startYear=df['startYear'].astype(int),
        genres=df['genres'].str.split(','),
    ).explode('genres').reset_index(drop=True)

    # Trim whitespace so "Comedy, Drama" and "Comedy,Drama" land in the same
    # bucket, and ignore empty tokens left by stray commas.
    df = df.assign(genres=df['genres'].str.strip())
    df = df[df['genres'] != '']

    # Group by startYear and genre, count the number of movies.
    genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')

    return genre_counts
|
35 |
|
36 |
# Function to fetch data for filled line chart of movie release years
|
37 |
def fetch_movie_release_years(conn):
    """Count movies released in each year.

    Runs a grouped query over ``title_basics`` for rows whose ``titleType``
    is ``'movie'`` and whose ``startYear`` is not the backslash-N
    placeholder, then normalises the year column to integers so the result
    agrees with ``fetch_genre_movie_releases`` and sorts numerically.

    Args:
        conn: An open database connection accepted by
            ``pandas.read_sql_query``.

    Returns:
        A DataFrame with columns ``startYear`` (int) and ``count`` (int),
        ordered by ascending year.
    """
    query_release_years = '''
        SELECT startYear, COUNT(*) as count
        FROM title_basics
        WHERE titleType = 'movie' AND startYear != '\\N'
        GROUP BY startYear
        ORDER BY startYear
    '''
    df_release_years = pd.read_sql_query(query_release_years, conn)

    # The SQL groups and orders on the stored representation of the year;
    # convert to numbers here and discard values that are not years at all.
    years = pd.to_numeric(df_release_years['startYear'], errors='coerce')
    df_release_years = df_release_years.assign(startYear=years).dropna(subset=['startYear'])
    df_release_years = df_release_years.assign(
        startYear=df_release_years['startYear'].astype(int)
    )

    # Re-aggregate in case distinct stored values parse to the same year;
    # groupby also leaves the rows sorted by numeric year.
    df_release_years = df_release_years.groupby('startYear', as_index=False)['count'].sum()
    return df_release_years
|
|
|
48 |
# Function to fetch data and create box plot of average rating by first_genre
|
49 |
def fetch_and_plot_average_rating_by_genre(conn):
    """Build a box plot of movie ratings grouped by each movie's first genre.

    Joins ``title_basics`` with ``title_ratings`` on ``tconst`` for rows
    whose ``titleType`` is ``'movie'`` and whose genre list is present,
    takes the first comma-separated entry of ``genres`` as the movie's
    genre, and plots ``averageRating`` per genre.

    Args:
        conn: An open database connection accepted by
            ``pandas.read_sql_query``.

    Returns:
        The figure produced by ``px.box``.
    """
    ratings_sql = '''
        SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres
        FROM title_basics tb
        JOIN title_ratings tr ON tb.tconst = tr.tconst
        WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
    '''
    ratings = pd.read_sql_query(ratings_sql, conn)

    # First entry of the comma-separated list, trimmed; falsy genre values
    # (None or empty string) become None so they can be dropped below.
    ratings['first_genre'] = ratings['genres'].apply(
        lambda listed: listed.split(',')[0].strip() if listed else None
    )

    # Rows without a usable first genre are excluded from the plot.
    ratings = ratings.dropna(subset=['first_genre'])

    return px.box(
        ratings,
        x='first_genre',
        y='averageRating',
        labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'},
        title='Average Rating of Movies by First Genre',
    )
|
77 |
|
78 |
# Function to create word cloud of genres
|
79 |
def create_genre_wordcloud(conn):
|
80 |
query = '''
|
81 |
+
SELECT genres
|
82 |
+
FROM title_basics
|
83 |
+
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
|
84 |
'''
|
85 |
df = pd.read_sql_query(query, conn)
|
86 |
|
|
|
101 |
# Function to find best movie of each genre by numVotes * averageRating
|
102 |
def find_best_movies_by_genre(conn):
|
103 |
query = '''
|
104 |
+
SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
|
105 |
+
FROM title_basics tb
|
106 |
+
JOIN title_ratings tr ON tb.tconst = tr.tconst
|
107 |
+
WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
|
108 |
'''
|
109 |
df = pd.read_sql_query(query, conn)
|
110 |
|
|
|
126 |
fig = px.line(genre_counts, x='startYear', y='count', color='genres',
|
127 |
title='Genre Movie Releases by Year',
|
128 |
labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
|
129 |
+
|
130 |
fig.update_layout(xaxis_tickmode='linear') # Ensure x-axis ticks are shown in a linear manner
|
131 |
fig.update_xaxes(range=[2000, 2025])
|
132 |
return fig
|
|
|
186 |
'ZM': 'Zambia', 'ZW': 'Zimbabwe'
|
187 |
}
|
188 |
|
189 |
+
|
190 |
# Map the codes to country names
|
191 |
df['country'] = df['country'].map(country_mapping)
|
192 |
|
|
|
195 |
color='total_movies', hover_name='country',
|
196 |
title='Total Films Per Country',
|
197 |
color_continuous_scale=px.colors.sequential.Plasma)
|
198 |
+
|
199 |
fig.update_layout(coloraxis_colorbar=dict(title='Total Movies', lenmode='fraction', len=0.7))
|
200 |
|
201 |
return fig
|
|
|
203 |
# Function to run the Streamlit application
|
204 |
def run_app():
    """Render the IMDb movie dashboard as a single-column Streamlit page.

    Opens the database via ``load_data('imdb.db')``, fetches the data for
    each visualization, builds the figures, and writes them to the page in
    order: genre releases by year, release years, average rating by genre,
    genre word cloud, best movies table, and the global map (read from
    ``movie_region.csv``).

    The connection is closed in a ``finally`` clause so it is released even
    when a query, the CSV read, or a plotting step raises.
    """
    st.title('IMDb Movie Dashboard')

    # Connect to SQLite database
    conn = load_data('imdb.db')
    try:
        # Fetch data for different visualizations
        genre_counts = fetch_genre_movie_releases(conn)
        df_release_years = fetch_movie_release_years(conn)
        best_movies = find_best_movies_by_genre(conn)

        # Create figures for each visualization
        fig_genre_releases = plot_genre_movie_releases(genre_counts)
        fig_movie_years = plot_movie_release_years(df_release_years)
        fig_average_rating = fetch_and_plot_average_rating_by_genre(conn)

        # Display charts using Streamlit
        st.header('Genre Movie Releases by Year')
        st.plotly_chart(fig_genre_releases, use_container_width=True)

        st.header('Movie Release Years')
        st.plotly_chart(fig_movie_years, use_container_width=True)

        st.header('Average Rating by Genre')
        st.plotly_chart(fig_average_rating, use_container_width=True)

        st.header('Genre Word Cloud')
        create_genre_wordcloud(conn)

        st.header('Best Movies by Genre')
        st.dataframe(best_movies)

        st.header('Global Map of Films')
        # The map data comes from a CSV expected next to the app, not from
        # the database.
        df_global_map = pd.read_csv('movie_region.csv')

        fig_global_map = plot_global_map(df_global_map)

        # Display the global map
        st.plotly_chart(fig_global_map, use_container_width=True)
    finally:
        # Close connection to database
        conn.close()
|
246 |
|