Penguni committed on
Commit
7b1a7ea
·
verified ·
1 Parent(s): 0632bea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -71
app.py CHANGED
@@ -15,32 +15,32 @@ def load_data(db_file):
15
  # Function to fetch genre movie releases by year
16
  def fetch_genre_movie_releases(conn):
17
  query = '''
18
- SELECT startYear, genres
19
- FROM title_basics
20
- WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
21
  '''
22
  df = pd.read_sql_query(query, conn)
23
-
24
  # Split genres and explode to separate rows
25
  df['genres'] = df['genres'].str.split(',')
26
  df = df.explode('genres')
27
-
28
  # Convert startYear to numeric
29
  df['startYear'] = pd.to_numeric(df['startYear'])
30
-
31
  # Group by startYear and genre, count the number of movies
32
  genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
33
-
34
  return genre_counts
35
 
36
  # Function to fetch data for filled line chart of movie release years
37
  def fetch_movie_release_years(conn):
38
  query_release_years = '''
39
- SELECT startYear, COUNT(*) as count
40
- FROM title_basics
41
- WHERE titleType = 'movie' AND startYear != '\\N'
42
- GROUP BY startYear
43
- ORDER BY startYear
44
  '''
45
  df_release_years = pd.read_sql_query(query_release_years, conn)
46
  return df_release_years
@@ -48,39 +48,39 @@ def fetch_movie_release_years(conn):
48
  # Function to fetch data and create box plot of average rating by first_genre
49
  def fetch_and_plot_average_rating_by_genre(conn):
50
  query = '''
51
- SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres
52
- FROM title_basics tb
53
- JOIN title_ratings tr ON tb.tconst = tr.tconst
54
- WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
55
  '''
56
  df = pd.read_sql_query(query, conn)
57
-
58
  # Function to extract the first genre from the genres list
59
  def extract_first_genre(genres):
60
  if genres:
61
  return genres.split(',')[0].strip()
62
  else:
63
  return None
64
-
65
  # Apply the function to extract the first genre
66
  df['first_genre'] = df['genres'].apply(extract_first_genre)
67
-
68
  # Drop rows where first_genre is None (shouldn't be necessary if genres column is clean)
69
  df = df.dropna(subset=['first_genre'])
70
-
71
  # Create a box plot of average rating by first_genre
72
  fig = px.box(df, x='first_genre', y='averageRating',
73
  labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'},
74
  title='Average Rating of Movies by First Genre')
75
-
76
  return fig
77
 
78
  # Function to create word cloud of genres
79
  def create_genre_wordcloud(conn):
80
  query = '''
81
- SELECT genres
82
- FROM title_basics
83
- WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
84
  '''
85
  df = pd.read_sql_query(query, conn)
86
 
@@ -101,10 +101,10 @@ def create_genre_wordcloud(conn):
101
  # Function to find best movie of each genre by numVotes * averageRating
102
  def find_best_movies_by_genre(conn):
103
  query = '''
104
- SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
105
- FROM title_basics tb
106
- JOIN title_ratings tr ON tb.tconst = tr.tconst
107
- WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
108
  '''
109
  df = pd.read_sql_query(query, conn)
110
 
@@ -126,7 +126,7 @@ def plot_genre_movie_releases(genre_counts):
126
  fig = px.line(genre_counts, x='startYear', y='count', color='genres',
127
  title='Genre Movie Releases by Year',
128
  labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
129
-
130
  fig.update_layout(xaxis_tickmode='linear') # Ensure x-axis ticks are shown in a linear manner
131
  fig.update_xaxes(range=[2000, 2025])
132
  return fig
@@ -186,6 +186,7 @@ def plot_global_map(df):
186
  'ZM': 'Zambia', 'ZW': 'Zimbabwe'
187
  }
188
 
 
189
  # Map the codes to country names
190
  df['country'] = df['country'].map(country_mapping)
191
 
@@ -194,7 +195,7 @@ def plot_global_map(df):
194
  color='total_movies', hover_name='country',
195
  title='Total Films Per Country',
196
  color_continuous_scale=px.colors.sequential.Plasma)
197
-
198
  fig.update_layout(coloraxis_colorbar=dict(title='Total Movies', lenmode='fraction', len=0.7))
199
 
200
  return fig
@@ -202,70 +203,44 @@ def plot_global_map(df):
202
  # Function to run the Streamlit application
203
  def run_app():
204
  st.title('IMDb Movie Dashboard')
205
-
206
  # Connect to SQLite database
207
  conn = load_data('imdb.db')
208
-
209
  # Fetch data for different visualizations
210
  genre_counts = fetch_genre_movie_releases(conn)
211
  df_release_years = fetch_movie_release_years(conn)
212
  best_movies = find_best_movies_by_genre(conn)
213
-
214
  # Create figures for each visualization
215
  fig_genre_releases = plot_genre_movie_releases(genre_counts)
216
  fig_movie_years = plot_movie_release_years(df_release_years)
217
  fig_average_rating = fetch_and_plot_average_rating_by_genre(conn)
218
-
219
- # Create layout for displaying charts
220
  st.header('Genre Movie Releases by Year')
221
  st.plotly_chart(fig_genre_releases, use_container_width=True)
222
-
223
  st.header('Movie Release Years')
224
  st.plotly_chart(fig_movie_years, use_container_width=True)
225
-
226
  st.header('Average Rating by Genre')
227
  st.plotly_chart(fig_average_rating, use_container_width=True)
228
-
229
  st.header('Genre Word Cloud')
230
  create_genre_wordcloud(conn)
231
-
232
  st.header('Best Movies by Genre')
233
  st.dataframe(best_movies)
234
-
235
  st.header('Global Map of Films')
236
- df_global_map = pd.read_csv('movie_region.csv')
237
-
238
  fig_global_map = plot_global_map(df_global_map)
239
-
240
- # Display in Streamlit
241
- st.header('Global Map of Films')
242
  st.plotly_chart(fig_global_map, use_container_width=True)
243
-
244
- # Define the layout using st.columns for a (3,2) grid
245
- col1, col2, col3 = st.columns(3)
246
-
247
- with col1:
248
- st.header('Genre Movie Releases by Year')
249
- st.plotly_chart(fig_genre_releases, use_container_width=True)
250
-
251
- with col2:
252
- st.header('Movie Release Years')
253
- st.plotly_chart(fig_movie_years, use_container_width=True)
254
-
255
- with col3:
256
- st.header('Average Rating by Genre')
257
- st.plotly_chart(fig_average_rating, use_container_width=True)
258
-
259
- col4, col5 = st.columns(2)
260
-
261
- with col4:
262
- st.header('Genre Word Cloud')
263
- create_genre_wordcloud(conn)
264
-
265
- with col5:
266
- st.header('Global Map of Films')
267
- st.plotly_chart(fig_global_map, use_container_width=True)
268
-
269
  # Close connection to database
270
  conn.close()
271
 
 
15
  # Function to fetch genre movie releases by year
16
  def fetch_genre_movie_releases(conn):
17
  query = '''
18
+ SELECT startYear, genres
19
+ FROM title_basics
20
+ WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
21
  '''
22
  df = pd.read_sql_query(query, conn)
23
+
24
  # Split genres and explode to separate rows
25
  df['genres'] = df['genres'].str.split(',')
26
  df = df.explode('genres')
27
+
28
  # Convert startYear to numeric
29
  df['startYear'] = pd.to_numeric(df['startYear'])
30
+
31
  # Group by startYear and genre, count the number of movies
32
  genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
33
+
34
  return genre_counts
35
 
36
  # Function to fetch data for filled line chart of movie release years
37
  def fetch_movie_release_years(conn):
38
  query_release_years = '''
39
+ SELECT startYear, COUNT(*) as count
40
+ FROM title_basics
41
+ WHERE titleType = 'movie' AND startYear != '\\N'
42
+ GROUP BY startYear
43
+ ORDER BY startYear
44
  '''
45
  df_release_years = pd.read_sql_query(query_release_years, conn)
46
  return df_release_years
 
48
  # Function to fetch data and create box plot of average rating by first_genre
49
  def fetch_and_plot_average_rating_by_genre(conn):
50
  query = '''
51
+ SELECT tb.tconst, tb.primaryTitle, tr.averageRating, tb.genres
52
+ FROM title_basics tb
53
+ JOIN title_ratings tr ON tb.tconst = tr.tconst
54
+ WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
55
  '''
56
  df = pd.read_sql_query(query, conn)
57
+
58
  # Function to extract the first genre from the genres list
59
  def extract_first_genre(genres):
60
  if genres:
61
  return genres.split(',')[0].strip()
62
  else:
63
  return None
64
+
65
  # Apply the function to extract the first genre
66
  df['first_genre'] = df['genres'].apply(extract_first_genre)
67
+
68
  # Drop rows where first_genre is None (shouldn't be necessary if genres column is clean)
69
  df = df.dropna(subset=['first_genre'])
70
+
71
  # Create a box plot of average rating by first_genre
72
  fig = px.box(df, x='first_genre', y='averageRating',
73
  labels={'first_genre': 'Genre', 'averageRating': 'Average Rating'},
74
  title='Average Rating of Movies by First Genre')
75
+
76
  return fig
77
 
78
  # Function to create word cloud of genres
79
  def create_genre_wordcloud(conn):
80
  query = '''
81
+ SELECT genres
82
+ FROM title_basics
83
+ WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
84
  '''
85
  df = pd.read_sql_query(query, conn)
86
 
 
101
  # Function to find best movie of each genre by numVotes * averageRating
102
  def find_best_movies_by_genre(conn):
103
  query = '''
104
+ SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
105
+ FROM title_basics tb
106
+ JOIN title_ratings tr ON tb.tconst = tr.tconst
107
+ WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
108
  '''
109
  df = pd.read_sql_query(query, conn)
110
 
 
126
  fig = px.line(genre_counts, x='startYear', y='count', color='genres',
127
  title='Genre Movie Releases by Year',
128
  labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
129
+
130
  fig.update_layout(xaxis_tickmode='linear') # Ensure x-axis ticks are shown in a linear manner
131
  fig.update_xaxes(range=[2000, 2025])
132
  return fig
 
186
  'ZM': 'Zambia', 'ZW': 'Zimbabwe'
187
  }
188
 
189
+
190
  # Map the codes to country names
191
  df['country'] = df['country'].map(country_mapping)
192
 
 
195
  color='total_movies', hover_name='country',
196
  title='Total Films Per Country',
197
  color_continuous_scale=px.colors.sequential.Plasma)
198
+
199
  fig.update_layout(coloraxis_colorbar=dict(title='Total Movies', lenmode='fraction', len=0.7))
200
 
201
  return fig
 
203
  # Function to run the Streamlit application
204
  def run_app():
205
  st.title('IMDb Movie Dashboard')
206
+
207
  # Connect to SQLite database
208
  conn = load_data('imdb.db')
209
+
210
  # Fetch data for different visualizations
211
  genre_counts = fetch_genre_movie_releases(conn)
212
  df_release_years = fetch_movie_release_years(conn)
213
  best_movies = find_best_movies_by_genre(conn)
214
+
215
  # Create figures for each visualization
216
  fig_genre_releases = plot_genre_movie_releases(genre_counts)
217
  fig_movie_years = plot_movie_release_years(df_release_years)
218
  fig_average_rating = fetch_and_plot_average_rating_by_genre(conn)
219
+
220
+ # Display charts using Streamlit
221
  st.header('Genre Movie Releases by Year')
222
  st.plotly_chart(fig_genre_releases, use_container_width=True)
223
+
224
  st.header('Movie Release Years')
225
  st.plotly_chart(fig_movie_years, use_container_width=True)
226
+
227
  st.header('Average Rating by Genre')
228
  st.plotly_chart(fig_average_rating, use_container_width=True)
229
+
230
  st.header('Genre Word Cloud')
231
  create_genre_wordcloud(conn)
232
+
233
  st.header('Best Movies by Genre')
234
  st.dataframe(best_movies)
235
+
236
  st.header('Global Map of Films')
237
+ df_global_map = pd.read_csv('movie_region.csv') # Assuming you have this CSV file
238
+
239
  fig_global_map = plot_global_map(df_global_map)
240
+
241
+ # Display the global map
 
242
  st.plotly_chart(fig_global_map, use_container_width=True)
243
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  # Close connection to database
245
  conn.close()
246