Penguni committed on
Commit
53c89d1
·
verified ·
1 Parent(s): 1108bbe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -100
app.py CHANGED
@@ -13,35 +13,35 @@ def load_data(db_file):
13
  conn = sqlite3.connect(db_file)
14
  return conn
15
  genre_color_map = {
16
- 'Documentary': '#FFB3BA', # Light Pink
17
- 'Animation': '#BAFFC9', # Light Green
18
- 'Comedy': '#FFFFBA', # Light Yellow
19
- 'Short': '#BAE1FF', # Light Blue
20
- 'Romance': '#FFDFBA', # Light Peach
21
- 'News': '#E1BAFF', # Light Purple
22
- 'Drama': '#FFC6C6', # Light Red
23
- 'Fantasy': '#C6FFBA', # Light Lime
24
- 'Horror': '#D3D3D3', # Light Gray
25
- 'Biography': '#FFE4B5', # Moccasin
26
- 'Music': '#B0E0E6', # Powder Blue
27
- 'Crime': '#F0E68C', # Khaki
28
- 'Family': '#98FB98', # Pale Green
29
- 'Action': '#FFA07A', # Light Salmon
30
- 'History': '#DEB887', # Burlywood
31
- 'Adventure': '#87CEFA', # Light Sky Blue
32
- 'Mystery': '#DDA0DD', # Plum
33
- 'Musical': '#FFB6C1', # Light Pink
34
- 'War': '#B0C4DE', # Light Steel Blue
35
- 'Sci-Fi': '#90EE90', # Light Green
36
- 'Western': '#F4A460', # Sandy Brown
37
- 'Thriller': '#FA8072', # Salmon
38
- 'Sport': '#20B2AA', # Light Sea Green
39
- 'Film-Noir': '#778899', # Light Slate Gray
40
- 'Talk-Show': '#FAFAD2', # Light Goldenrod Yellow
41
- 'Game-Show': '#FFC0CB', # Pink
42
- 'Adult': '#DB7093', # Pale Violet Red
43
- 'Reality-TV': '#F08080' # Light Coral
44
- }
45
 
46
  def fetch_genre_movie_releases(conn):
47
  query = r'''
@@ -51,19 +51,15 @@ def fetch_genre_movie_releases(conn):
51
  '''
52
  df = pd.read_sql_query(query, conn)
53
 
54
-
55
  df['genres'] = df['genres'].str.split(',')
56
  df = df.explode('genres')
57
-
58
 
59
  df['startYear'] = pd.to_numeric(df['startYear'])
60
 
61
-
62
  genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
63
 
64
  return genre_counts
65
 
66
-
67
  def fetch_movie_release_years(conn):
68
  query_release_years = r'''
69
  SELECT startYear, COUNT(*) as count
@@ -75,7 +71,6 @@ def fetch_movie_release_years(conn):
75
  df_release_years = pd.read_sql_query(query_release_years, conn)
76
  return df_release_years
77
 
78
-
79
  def fetch_and_plot_average_rating_by_genre(conn):
80
  query = r'''
81
  SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres
@@ -85,31 +80,25 @@ def fetch_and_plot_average_rating_by_genre(conn):
85
  '''
86
  df = pd.read_sql_query(query, conn)
87
 
88
-
89
  def extract_first_genre(genres):
90
  if genres:
91
  return genres.split(',')[0].strip()
92
  else:
93
  return None
94
 
95
-
96
  df['first_genre'] = df['genres'].apply(extract_first_genre)
97
-
98
 
99
  df = df.dropna(subset=['first_genre'])
100
 
101
-
102
  fig = px.box(df, x='first_genre', y='averageRating',
103
  labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'},
104
  title='Average Rating of Movies by First Genre',
105
  color='first_genre',
106
  color_discrete_map=genre_color_map)
107
 
108
- return fig
109
-
110
 
111
  def genre_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
112
- return genre_color_map.get(word, '
113
 
114
  def create_genre_wordcloud(conn):
115
  query = r'''
@@ -119,20 +108,16 @@ def create_genre_wordcloud(conn):
119
  '''
120
  df = pd.read_sql_query(query, conn)
121
 
122
-
123
  genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
124
  genre_counts = Counter(genres)
125
 
126
-
127
  wordcloud = WordCloud(width=800, height=800, background_color='white', color_func=genre_color_func).generate_from_frequencies(genre_counts)
128
 
129
-
130
  plt.figure(figsize=(10, 10))
131
  plt.imshow(wordcloud, interpolation='bilinear')
132
  plt.axis('off')
133
  st.pyplot(plt.gcf())
134
 
135
-
136
  def find_best_movies_by_genre(conn):
137
  query = r'''
138
  SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
@@ -142,28 +127,25 @@ def find_best_movies_by_genre(conn):
142
  '''
143
  df = pd.read_sql_query(query, conn)
144
 
145
-
146
  df['genre'] = df['genres'].str.split(',', expand=True)[0]
147
 
148
-
149
  df['score'] = df['numVotes'] * df['averageRating']
150
 
151
-
152
  idx = df.groupby('genre')['score'].idxmax()
153
  best_movies_by_genre = df.loc[idx, ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']] \
154
  .sort_values(by='score', ascending=False).reset_index(drop=True)
155
 
156
  return best_movies_by_genre
157
 
158
-
159
  def plot_stacked_genre_movie_releases(genre_counts):
160
  fig = px.area(genre_counts, x='startYear', y='count', color='genres',
161
  title='Stacked Genre Movie Releases by Year',
162
  labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'},
163
- line_group='genres',
164
- hover_name='genres',
165
- hover_data={'count': ':.0f'},
166
- color_discrete_map=genre_color_map)
167
 
168
  return fig
169
 
@@ -171,10 +153,9 @@ def plot_stacked_genre_movie_releases(genre_counts):
171
 
172
 
173
  def plot_global_map(conn):
174
-
175
  movie_region_df = pd.read_csv('movie_region.csv')
176
 
177
-
178
  query_genre = '''
179
  SELECT tconst AS titleId, primaryTitle,
180
  CASE
@@ -184,19 +165,14 @@ def plot_global_map(conn):
184
  FROM title_basics;
185
  '''
186
 
187
-
188
  genre_data_df = pd.read_sql_query(query_genre, conn)
189
 
190
-
191
  merged_df = pd.merge(movie_region_df, genre_data_df, on='titleId', how='inner')
192
 
193
-
194
  df = merged_df.replace('\\N', np.nan).dropna(subset=['first_genre'])
195
 
196
-
197
  grouped = df.groupby('region')['first_genre'].agg(lambda x: ', '.join(x)).reset_index()
198
 
199
-
200
  grouped['genres_list'] = grouped['first_genre'].apply(lambda x: x.split(', '))
201
  grouped['most_common_genre'] = grouped['genres_list'].apply(lambda x: pd.Series(x).value_counts().index[0] if len(x) > 0 else '')
202
 
@@ -251,34 +227,34 @@ def plot_global_map(conn):
251
  }
252
  result.loc[:, 'region'] = result['region'].map(country_mapping)
253
  genre_color_map = {
254
- 'Documentary': '
255
- 'Animation': '
256
- 'Comedy': '
257
- 'Short': '
258
- 'Romance': '
259
- 'News': '
260
- 'Drama': '
261
- 'Fantasy': '
262
- 'Horror': '
263
- 'Biography': '
264
- 'Music': '
265
- 'Crime': '
266
- 'Family': '
267
- 'Action': '
268
- 'History': '
269
- 'Adventure': '
270
- 'Mystery': '
271
- 'Musical': '
272
- 'War': '
273
- 'Sci-Fi': '
274
- 'Western': '
275
- 'Thriller': '
276
- 'Sport': '
277
- 'Film-Noir': '
278
- 'Talk-Show': '
279
- 'Game-Show': '
280
- 'Adult': '
281
- 'Reality-TV': '
282
  }
283
  fig = px.choropleth(
284
  result,
@@ -291,15 +267,15 @@ def plot_global_map(conn):
291
  color_discrete_map=genre_color_map,
292
  )
293
 
294
-
295
  fig.update_layout(
296
  geo=dict(showframe=False, showcoastlines=True, projection_type='natural earth')
297
  )
298
  return fig
299
 
300
-
301
  def fetch_summary_info(conn):
302
-
303
  query_total_movies = r'''
304
  SELECT COUNT(*) as total_movies
305
  FROM title_basics
@@ -307,7 +283,7 @@ def fetch_summary_info(conn):
307
  '''
308
  total_movies = pd.read_sql_query(query_total_movies, conn).iloc[0]['total_movies']
309
 
310
-
311
  query_total_years = r'''
312
  SELECT COUNT(DISTINCT startYear) as total_years
313
  FROM title_basics
@@ -315,7 +291,7 @@ def fetch_summary_info(conn):
315
  '''
316
  total_years = pd.read_sql_query(query_total_years, conn).iloc[0]['total_years']
317
 
318
-
319
  query_avg_rating = r'''
320
  SELECT AVG(averageRating) as avg_rating
321
  FROM title_ratings
@@ -324,16 +300,16 @@ def fetch_summary_info(conn):
324
 
325
  return total_movies, total_years, avg_rating
326
 
327
-
328
  def run_app():
329
  st.title('IMDb Movie Data Analysis')
330
 
331
-
332
  conn = load_data('imdb_data.db')
333
  genre_counts = fetch_genre_movie_releases(conn)
334
  total_movies, total_years, avg_rating = fetch_summary_info(conn)
335
 
336
-
337
  col1, col2, col3 = st.columns(3)
338
 
339
  with col1:
@@ -348,12 +324,12 @@ def run_app():
348
  st.subheader('Average Movie Rating')
349
  st.metric(label='zzz', value=f'{avg_rating:.2f}')
350
 
351
-
352
  best_movies_by_genre = find_best_movies_by_genre(conn)
353
  fig_global_map = plot_global_map(conn)
354
  fig_genre_movie_releases = plot_stacked_genre_movie_releases(genre_counts)
355
 
356
-
357
  col1, col2 = st.columns(2)
358
 
359
  with col1:
@@ -366,7 +342,7 @@ def run_app():
366
 
367
  fig_avg_rating_by_genre = fetch_and_plot_average_rating_by_genre(conn)
368
 
369
-
370
  col1, col2, col3 = st.columns(3)
371
 
372
  with col1:
@@ -380,7 +356,7 @@ def run_app():
380
  st.subheader('Average Rating by Genre')
381
  st.plotly_chart(fig_avg_rating_by_genre, use_container_width=True)
382
 
383
-
384
  conn.close()
385
 
386
  if __name__ == '__main__':
 
13
  conn = sqlite3.connect(db_file)
14
  return conn
15
  genre_color_map = {
16
+ 'Documentary': '#FFB3BA', # Light Pink
17
+ 'Animation': '#BAFFC9', # Light Green
18
+ 'Comedy': '#FFFFBA', # Light Yellow
19
+ 'Short': '#BAE1FF', # Light Blue
20
+ 'Romance': '#FFDFBA', # Light Peach
21
+ 'News': '#E1BAFF', # Light Purple
22
+ 'Drama': '#FFC6C6', # Light Red
23
+ 'Fantasy': '#C6FFBA', # Light Lime
24
+ 'Horror': '#D3D3D3', # Light Gray
25
+ 'Biography': '#FFE4B5', # Moccasin
26
+ 'Music': '#B0E0E6', # Powder Blue
27
+ 'Crime': '#F0E68C', # Khaki
28
+ 'Family': '#98FB98', # Pale Green
29
+ 'Action': '#FFA07A', # Light Salmon
30
+ 'History': '#DEB887', # Burlywood
31
+ 'Adventure': '#87CEFA', # Light Sky Blue
32
+ 'Mystery': '#DDA0DD', # Plum
33
+ 'Musical': '#FFB6C1', # Light Pink
34
+ 'War': '#B0C4DE', # Light Steel Blue
35
+ 'Sci-Fi': '#90EE90', # Light Green
36
+ 'Western': '#F4A460', # Sandy Brown
37
+ 'Thriller': '#FA8072', # Salmon
38
+ 'Sport': '#20B2AA', # Light Sea Green
39
+ 'Film-Noir': '#778899', # Light Slate Gray
40
+ 'Talk-Show': '#FAFAD2', # Light Goldenrod Yellow
41
+ 'Game-Show': '#FFC0CB', # Pink
42
+ 'Adult': '#DB7093', # Pale Violet Red
43
+ 'Reality-TV': '#F08080' # Light Coral
44
+ }
45
 
46
  def fetch_genre_movie_releases(conn):
47
  query = r'''
 
51
  '''
52
  df = pd.read_sql_query(query, conn)
53
 
 
54
  df['genres'] = df['genres'].str.split(',')
55
  df = df.explode('genres')
 
56
 
57
  df['startYear'] = pd.to_numeric(df['startYear'])
58
 
 
59
  genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
60
 
61
  return genre_counts
62
 
 
63
  def fetch_movie_release_years(conn):
64
  query_release_years = r'''
65
  SELECT startYear, COUNT(*) as count
 
71
  df_release_years = pd.read_sql_query(query_release_years, conn)
72
  return df_release_years
73
 
 
74
  def fetch_and_plot_average_rating_by_genre(conn):
75
  query = r'''
76
  SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres
 
80
  '''
81
  df = pd.read_sql_query(query, conn)
82
 
 
83
  def extract_first_genre(genres):
84
  if genres:
85
  return genres.split(',')[0].strip()
86
  else:
87
  return None
88
 
 
89
  df['first_genre'] = df['genres'].apply(extract_first_genre)
 
90
 
91
  df = df.dropna(subset=['first_genre'])
92
 
 
93
  fig = px.box(df, x='first_genre', y='averageRating',
94
  labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'},
95
  title='Average Rating of Movies by First Genre',
96
  color='first_genre',
97
  color_discrete_map=genre_color_map)
98
 
 
 
99
 
100
  def genre_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
101
+ return genre_color_map.get(word, '#FFFFFF')
102
 
103
  def create_genre_wordcloud(conn):
104
  query = r'''
 
108
  '''
109
  df = pd.read_sql_query(query, conn)
110
 
 
111
  genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
112
  genre_counts = Counter(genres)
113
 
 
114
  wordcloud = WordCloud(width=800, height=800, background_color='white', color_func=genre_color_func).generate_from_frequencies(genre_counts)
115
 
 
116
  plt.figure(figsize=(10, 10))
117
  plt.imshow(wordcloud, interpolation='bilinear')
118
  plt.axis('off')
119
  st.pyplot(plt.gcf())
120
 
 
121
  def find_best_movies_by_genre(conn):
122
  query = r'''
123
  SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
 
127
  '''
128
  df = pd.read_sql_query(query, conn)
129
 
 
130
  df['genre'] = df['genres'].str.split(',', expand=True)[0]
131
 
132
+
133
  df['score'] = df['numVotes'] * df['averageRating']
134
 
 
135
  idx = df.groupby('genre')['score'].idxmax()
136
  best_movies_by_genre = df.loc[idx, ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']] \
137
  .sort_values(by='score', ascending=False).reset_index(drop=True)
138
 
139
  return best_movies_by_genre
140
 
 
141
  def plot_stacked_genre_movie_releases(genre_counts):
142
  fig = px.area(genre_counts, x='startYear', y='count', color='genres',
143
  title='Stacked Genre Movie Releases by Year',
144
  labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'},
145
+ line_group='genres', # This groups lines by genre
146
+ hover_name='genres', # This sets the genre as the hover label
147
+ hover_data={'count': ':.0f'}, # Format hover data as integer
148
+ color_discrete_map=genre_color_map) # Apply color map
149
 
150
  return fig
151
 
 
153
 
154
 
155
  def plot_global_map(conn):
 
156
  movie_region_df = pd.read_csv('movie_region.csv')
157
 
158
+ # SQL query to get unique first genre of each title
159
  query_genre = '''
160
  SELECT tconst AS titleId, primaryTitle,
161
  CASE
 
165
  FROM title_basics;
166
  '''
167
 
 
168
  genre_data_df = pd.read_sql_query(query_genre, conn)
169
 
 
170
  merged_df = pd.merge(movie_region_df, genre_data_df, on='titleId', how='inner')
171
 
 
172
  df = merged_df.replace('\\N', np.nan).dropna(subset=['first_genre'])
173
 
 
174
  grouped = df.groupby('region')['first_genre'].agg(lambda x: ', '.join(x)).reset_index()
175
 
 
176
  grouped['genres_list'] = grouped['first_genre'].apply(lambda x: x.split(', '))
177
  grouped['most_common_genre'] = grouped['genres_list'].apply(lambda x: pd.Series(x).value_counts().index[0] if len(x) > 0 else '')
178
 
 
227
  }
228
  result.loc[:, 'region'] = result['region'].map(country_mapping)
229
  genre_color_map = {
230
+ 'Documentary': '#FFB3BA', # Light Pink
231
+ 'Animation': '#BAFFC9', # Light Green
232
+ 'Comedy': '#FFFFBA', # Light Yellow
233
+ 'Short': '#BAE1FF', # Light Blue
234
+ 'Romance': '#FFDFBA', # Light Peach
235
+ 'News': '#E1BAFF', # Light Purple
236
+ 'Drama': '#FFC6C6', # Light Red
237
+ 'Fantasy': '#C6FFBA', # Light Lime
238
+ 'Horror': '#D3D3D3', # Light Gray
239
+ 'Biography': '#FFE4B5', # Moccasin
240
+ 'Music': '#B0E0E6', # Powder Blue
241
+ 'Crime': '#F0E68C', # Khaki
242
+ 'Family': '#98FB98', # Pale Green
243
+ 'Action': '#FFA07A', # Light Salmon
244
+ 'History': '#DEB887', # Burlywood
245
+ 'Adventure': '#87CEFA', # Light Sky Blue
246
+ 'Mystery': '#DDA0DD', # Plum
247
+ 'Musical': '#FFB6C1', # Light Pink
248
+ 'War': '#B0C4DE', # Light Steel Blue
249
+ 'Sci-Fi': '#90EE90', # Light Green
250
+ 'Western': '#F4A460', # Sandy Brown
251
+ 'Thriller': '#FA8072', # Salmon
252
+ 'Sport': '#20B2AA', # Light Sea Green
253
+ 'Film-Noir': '#778899', # Light Slate Gray
254
+ 'Talk-Show': '#FAFAD2', # Light Goldenrod Yellow
255
+ 'Game-Show': '#FFC0CB', # Pink
256
+ 'Adult': '#DB7093', # Pale Violet Red
257
+ 'Reality-TV': '#F08080' # Light Coral
258
  }
259
  fig = px.choropleth(
260
  result,
 
267
  color_discrete_map=genre_color_map,
268
  )
269
 
270
+ # Update the layout
271
  fig.update_layout(
272
  geo=dict(showframe=False, showcoastlines=True, projection_type='natural earth')
273
  )
274
  return fig
275
 
276
+ # Function to fetch summary info
277
  def fetch_summary_info(conn):
278
+ # Fetch total count of movies
279
  query_total_movies = r'''
280
  SELECT COUNT(*) as total_movies
281
  FROM title_basics
 
283
  '''
284
  total_movies = pd.read_sql_query(query_total_movies, conn).iloc[0]['total_movies']
285
 
286
+ # Fetch total count of years
287
  query_total_years = r'''
288
  SELECT COUNT(DISTINCT startYear) as total_years
289
  FROM title_basics
 
291
  '''
292
  total_years = pd.read_sql_query(query_total_years, conn).iloc[0]['total_years']
293
 
294
+ # Fetch average rating of movies
295
  query_avg_rating = r'''
296
  SELECT AVG(averageRating) as avg_rating
297
  FROM title_ratings
 
300
 
301
  return total_movies, total_years, avg_rating
302
 
303
+ # Main Streamlit app
304
  def run_app():
305
  st.title('IMDb Movie Data Analysis')
306
 
307
+ # Load data from SQLite database
308
  conn = load_data('imdb_data.db')
309
  genre_counts = fetch_genre_movie_releases(conn)
310
  total_movies, total_years, avg_rating = fetch_summary_info(conn)
311
 
312
+ # Layout for summary info in three columns
313
  col1, col2, col3 = st.columns(3)
314
 
315
  with col1:
 
324
  st.subheader('Average Movie Rating')
325
  st.metric(label='zzz', value=f'{avg_rating:.2f}')
326
 
327
+ # Find and display best movies by genre
328
  best_movies_by_genre = find_best_movies_by_genre(conn)
329
  fig_global_map = plot_global_map(conn)
330
  fig_genre_movie_releases = plot_stacked_genre_movie_releases(genre_counts)
331
 
332
+ # Layout for best movies by genre in two columns
333
  col1, col2 = st.columns(2)
334
 
335
  with col1:
 
342
 
343
  fig_avg_rating_by_genre = fetch_and_plot_average_rating_by_genre(conn)
344
 
345
+ # Layout for Plotly charts in three columns
346
  col1, col2, col3 = st.columns(3)
347
 
348
  with col1:
 
356
  st.subheader('Average Rating by Genre')
357
  st.plotly_chart(fig_avg_rating_by_genre, use_container_width=True)
358
 
359
+ # Close database connection
360
  conn.close()
361
 
362
  if __name__ == '__main__':